973 lines
36 KiB
Text
|
Index: src/arm/64/ipred.S
|
||
|
--- src/arm/64/ipred.S.orig
|
||
|
+++ src/arm/64/ipred.S
|
||
|
@@ -34,11 +34,11 @@
|
||
|
// const int max_width, const int max_height);
|
||
|
function ipred_dc_128_8bpc_neon, export=1
|
||
|
clz w3, w3
|
||
|
- adr x5, L(ipred_dc_128_tbl)
|
||
|
+ adrp x5, L(ipred_dc_128_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_dc_128_tbl)
|
||
|
sub w3, w3, #25
|
||
|
- ldrh w3, [x5, w3, uxtw #1]
|
||
|
+ ldr x5, [x5, w3, uxtw #3]
|
||
|
movi v0.16b, #128
|
||
|
- sub x5, x5, w3, uxtw
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x5
|
||
|
@@ -94,12 +94,14 @@ function ipred_dc_128_8bpc_neon, export=1
|
||
|
b.gt 64b
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_dc_128_tbl):
|
||
|
- .hword L(ipred_dc_128_tbl) - 640b
|
||
|
- .hword L(ipred_dc_128_tbl) - 320b
|
||
|
- .hword L(ipred_dc_128_tbl) - 16b
|
||
|
- .hword L(ipred_dc_128_tbl) - 8b
|
||
|
- .hword L(ipred_dc_128_tbl) - 4b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 16b
|
||
|
+ .xword 8b
|
||
|
+ .xword 4b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -108,11 +110,11 @@ endfunc
|
||
|
// const int max_width, const int max_height);
|
||
|
function ipred_v_8bpc_neon, export=1
|
||
|
clz w3, w3
|
||
|
- adr x5, L(ipred_v_tbl)
|
||
|
+ adrp x5, L(ipred_v_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_v_tbl)
|
||
|
sub w3, w3, #25
|
||
|
- ldrh w3, [x5, w3, uxtw #1]
|
||
|
+ ldr x5, [x5, w3, uxtw #3]
|
||
|
add x2, x2, #1
|
||
|
- sub x5, x5, w3, uxtw
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x5
|
||
|
@@ -172,12 +174,14 @@ function ipred_v_8bpc_neon, export=1
|
||
|
b.gt 64b
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_v_tbl):
|
||
|
- .hword L(ipred_v_tbl) - 640b
|
||
|
- .hword L(ipred_v_tbl) - 320b
|
||
|
- .hword L(ipred_v_tbl) - 160b
|
||
|
- .hword L(ipred_v_tbl) - 80b
|
||
|
- .hword L(ipred_v_tbl) - 40b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -186,11 +190,11 @@ endfunc
|
||
|
// const int max_width, const int max_height);
|
||
|
function ipred_h_8bpc_neon, export=1
|
||
|
clz w3, w3
|
||
|
- adr x5, L(ipred_h_tbl)
|
||
|
+ adrp x5, L(ipred_h_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_h_tbl)
|
||
|
sub w3, w3, #25
|
||
|
- ldrh w3, [x5, w3, uxtw #1]
|
||
|
+ ldr x5, [x5, w3, uxtw #3]
|
||
|
sub x2, x2, #4
|
||
|
- sub x5, x5, w3, uxtw
|
||
|
mov x7, #-4
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
@@ -258,12 +262,14 @@ function ipred_h_8bpc_neon, export=1
|
||
|
b.gt 64b
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_h_tbl):
|
||
|
- .hword L(ipred_h_tbl) - 64b
|
||
|
- .hword L(ipred_h_tbl) - 32b
|
||
|
- .hword L(ipred_h_tbl) - 16b
|
||
|
- .hword L(ipred_h_tbl) - 8b
|
||
|
- .hword L(ipred_h_tbl) - 4b
|
||
|
+ .xword 64b
|
||
|
+ .xword 32b
|
||
|
+ .xword 16b
|
||
|
+ .xword 8b
|
||
|
+ .xword 4b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -272,11 +278,11 @@ endfunc
|
||
|
// const int max_width, const int max_height);
|
||
|
function ipred_dc_top_8bpc_neon, export=1
|
||
|
clz w3, w3
|
||
|
- adr x5, L(ipred_dc_top_tbl)
|
||
|
+ adrp x5, L(ipred_dc_top_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_dc_top_tbl)
|
||
|
sub w3, w3, #25
|
||
|
- ldrh w3, [x5, w3, uxtw #1]
|
||
|
+ ldr x5, [x5, w3, uxtw #3]
|
||
|
add x2, x2, #1
|
||
|
- sub x5, x5, w3, uxtw
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x5
|
||
|
@@ -363,12 +369,14 @@ function ipred_dc_top_8bpc_neon, export=1
|
||
|
b.gt 64b
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_dc_top_tbl):
|
||
|
- .hword L(ipred_dc_top_tbl) - 640b
|
||
|
- .hword L(ipred_dc_top_tbl) - 320b
|
||
|
- .hword L(ipred_dc_top_tbl) - 160b
|
||
|
- .hword L(ipred_dc_top_tbl) - 80b
|
||
|
- .hword L(ipred_dc_top_tbl) - 40b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -379,13 +387,12 @@ function ipred_dc_left_8bpc_neon, export=1
|
||
|
sub x2, x2, w4, uxtw
|
||
|
clz w3, w3
|
||
|
clz w7, w4
|
||
|
- adr x5, L(ipred_dc_left_tbl)
|
||
|
+ adrp x5, L(ipred_dc_left_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_dc_left_tbl)
|
||
|
sub w3, w3, #20 // 25 leading bits, minus table offset 5
|
||
|
sub w7, w7, #25
|
||
|
- ldrh w3, [x5, w3, uxtw #1]
|
||
|
- ldrh w7, [x5, w7, uxtw #1]
|
||
|
- sub x3, x5, w3, uxtw
|
||
|
- sub x5, x5, w7, uxtw
|
||
|
+ ldr x3, [x5, w3, uxtw #3]
|
||
|
+ ldr x5, [x5, w7, uxtw #3]
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x5
|
||
|
@@ -489,17 +496,19 @@ L(ipred_dc_left_w64):
|
||
|
b.gt 1b
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_dc_left_tbl):
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
|
||
|
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
|
||
|
+ .xword L(ipred_dc_left_h64)
|
||
|
+ .xword L(ipred_dc_left_h32)
|
||
|
+ .xword L(ipred_dc_left_h16)
|
||
|
+ .xword L(ipred_dc_left_h8)
|
||
|
+ .xword L(ipred_dc_left_h4)
|
||
|
+ .xword L(ipred_dc_left_w64)
|
||
|
+ .xword L(ipred_dc_left_w32)
|
||
|
+ .xword L(ipred_dc_left_w16)
|
||
|
+ .xword L(ipred_dc_left_w8)
|
||
|
+ .xword L(ipred_dc_left_w4)
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -512,16 +521,15 @@ function ipred_dc_8bpc_neon, export=1
|
||
|
clz w3, w3
|
||
|
clz w6, w4
|
||
|
dup v16.8h, w7 // width + height
|
||
|
- adr x5, L(ipred_dc_tbl)
|
||
|
+ adrp x5, L(ipred_dc_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_dc_tbl)
|
||
|
rbit w7, w7 // rbit(width + height)
|
||
|
sub w3, w3, #20 // 25 leading bits, minus table offset 5
|
||
|
sub w6, w6, #25
|
||
|
clz w7, w7 // ctz(width + height)
|
||
|
- ldrh w3, [x5, w3, uxtw #1]
|
||
|
- ldrh w6, [x5, w6, uxtw #1]
|
||
|
+ ldr x3, [x5, w3, uxtw #3]
|
||
|
+ ldr x5, [x5, w6, uxtw #3]
|
||
|
neg w7, w7 // -ctz(width + height)
|
||
|
- sub x3, x5, w3, uxtw
|
||
|
- sub x5, x5, w6, uxtw
|
||
|
ushr v16.8h, v16.8h, #1 // (width + height) >> 1
|
||
|
dup v17.8h, w7 // -ctz(width + height)
|
||
|
add x6, x0, x1
|
||
|
@@ -714,17 +722,19 @@ L(ipred_dc_w64):
|
||
|
b.gt 2b
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_dc_tbl):
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
|
||
|
- .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
|
||
|
+ .xword L(ipred_dc_h64)
|
||
|
+ .xword L(ipred_dc_h32)
|
||
|
+ .xword L(ipred_dc_h16)
|
||
|
+ .xword L(ipred_dc_h8)
|
||
|
+ .xword L(ipred_dc_h4)
|
||
|
+ .xword L(ipred_dc_w64)
|
||
|
+ .xword L(ipred_dc_w32)
|
||
|
+ .xword L(ipred_dc_w16)
|
||
|
+ .xword L(ipred_dc_w8)
|
||
|
+ .xword L(ipred_dc_w4)
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -733,13 +743,13 @@ endfunc
|
||
|
// const int max_width, const int max_height);
|
||
|
function ipred_paeth_8bpc_neon, export=1
|
||
|
clz w9, w3
|
||
|
- adr x5, L(ipred_paeth_tbl)
|
||
|
+ adrp x5, L(ipred_paeth_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_paeth_tbl)
|
||
|
sub w9, w9, #25
|
||
|
- ldrh w9, [x5, w9, uxtw #1]
|
||
|
+ ldr x5, [x5, w9, uxtw #3]
|
||
|
ld1r {v4.16b}, [x2]
|
||
|
add x8, x2, #1
|
||
|
sub x2, x2, #4
|
||
|
- sub x5, x5, w9, uxtw
|
||
|
mov x7, #-4
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
@@ -899,12 +909,14 @@ function ipred_paeth_8bpc_neon, export=1
|
||
|
9:
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_paeth_tbl):
|
||
|
- .hword L(ipred_paeth_tbl) - 640b
|
||
|
- .hword L(ipred_paeth_tbl) - 320b
|
||
|
- .hword L(ipred_paeth_tbl) - 160b
|
||
|
- .hword L(ipred_paeth_tbl) - 80b
|
||
|
- .hword L(ipred_paeth_tbl) - 40b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -916,13 +928,13 @@ function ipred_smooth_8bpc_neon, export=1
|
||
|
add x11, x10, w4, uxtw
|
||
|
add x10, x10, w3, uxtw
|
||
|
clz w9, w3
|
||
|
- adr x5, L(ipred_smooth_tbl)
|
||
|
+ adrp x5, L(ipred_smooth_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_smooth_tbl)
|
||
|
sub x12, x2, w4, uxtw
|
||
|
sub w9, w9, #25
|
||
|
- ldrh w9, [x5, w9, uxtw #1]
|
||
|
+ ldr x5, [x5, w9, uxtw #3]
|
||
|
ld1r {v4.16b}, [x12] // bottom
|
||
|
add x8, x2, #1
|
||
|
- sub x5, x5, w9, uxtw
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x5
|
||
|
@@ -1080,12 +1092,14 @@ function ipred_smooth_8bpc_neon, export=1
|
||
|
9:
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_smooth_tbl):
|
||
|
- .hword L(ipred_smooth_tbl) - 640b
|
||
|
- .hword L(ipred_smooth_tbl) - 320b
|
||
|
- .hword L(ipred_smooth_tbl) - 160b
|
||
|
- .hword L(ipred_smooth_tbl) - 80b
|
||
|
- .hword L(ipred_smooth_tbl) - 40b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -1096,13 +1110,13 @@ function ipred_smooth_v_8bpc_neon, export=1
|
||
|
movrel x7, X(sm_weights)
|
||
|
add x7, x7, w4, uxtw
|
||
|
clz w9, w3
|
||
|
- adr x5, L(ipred_smooth_v_tbl)
|
||
|
+ adrp x5, L(ipred_smooth_v_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_smooth_v_tbl)
|
||
|
sub x8, x2, w4, uxtw
|
||
|
sub w9, w9, #25
|
||
|
- ldrh w9, [x5, w9, uxtw #1]
|
||
|
+ ldr x5, [x5, w9, uxtw #3]
|
||
|
ld1r {v4.16b}, [x8] // bottom
|
||
|
add x2, x2, #1
|
||
|
- sub x5, x5, w9, uxtw
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x5
|
||
|
@@ -1221,12 +1235,14 @@ function ipred_smooth_v_8bpc_neon, export=1
|
||
|
9:
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_smooth_v_tbl):
|
||
|
- .hword L(ipred_smooth_v_tbl) - 640b
|
||
|
- .hword L(ipred_smooth_v_tbl) - 320b
|
||
|
- .hword L(ipred_smooth_v_tbl) - 160b
|
||
|
- .hword L(ipred_smooth_v_tbl) - 80b
|
||
|
- .hword L(ipred_smooth_v_tbl) - 40b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -1237,12 +1253,12 @@ function ipred_smooth_h_8bpc_neon, export=1
|
||
|
movrel x8, X(sm_weights)
|
||
|
add x8, x8, w3, uxtw
|
||
|
clz w9, w3
|
||
|
- adr x5, L(ipred_smooth_h_tbl)
|
||
|
+ adrp x5, L(ipred_smooth_h_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_smooth_h_tbl)
|
||
|
add x12, x2, w3, uxtw
|
||
|
sub w9, w9, #25
|
||
|
- ldrh w9, [x5, w9, uxtw #1]
|
||
|
+ ldr x5, [x5, w9, uxtw #3]
|
||
|
ld1r {v5.16b}, [x12] // right
|
||
|
- sub x5, x5, w9, uxtw
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x5
|
||
|
@@ -1367,12 +1383,14 @@ function ipred_smooth_h_8bpc_neon, export=1
|
||
|
9:
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_smooth_h_tbl):
|
||
|
- .hword L(ipred_smooth_h_tbl) - 640b
|
||
|
- .hword L(ipred_smooth_h_tbl) - 320b
|
||
|
- .hword L(ipred_smooth_h_tbl) - 160b
|
||
|
- .hword L(ipred_smooth_h_tbl) - 80b
|
||
|
- .hword L(ipred_smooth_h_tbl) - 40b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
const padding_mask_buf
|
||
|
@@ -1653,11 +1671,11 @@ endfunc
|
||
|
// const int dx, const int max_base_x);
|
||
|
function ipred_z1_fill1_8bpc_neon, export=1
|
||
|
clz w9, w3
|
||
|
- adr x8, L(ipred_z1_fill1_tbl)
|
||
|
+ adrp x8, L(ipred_z1_fill1_tbl)
|
||
|
+ add x8, x8, :lo12: L(ipred_z1_fill1_tbl)
|
||
|
sub w9, w9, #25
|
||
|
- ldrh w9, [x8, w9, uxtw #1]
|
||
|
+ ldr x8, [x8, w9, uxtw #3]
|
||
|
add x10, x2, w6, uxtw // top[max_base_x]
|
||
|
- sub x8, x8, w9, uxtw
|
||
|
ld1r {v31.16b}, [x10] // padding
|
||
|
mov w7, w5
|
||
|
mov w15, #64
|
||
|
@@ -1816,12 +1834,14 @@ function ipred_z1_fill1_8bpc_neon, export=1
|
||
|
mov w3, w12
|
||
|
b 169b
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_z1_fill1_tbl):
|
||
|
- .hword L(ipred_z1_fill1_tbl) - 640b
|
||
|
- .hword L(ipred_z1_fill1_tbl) - 320b
|
||
|
- .hword L(ipred_z1_fill1_tbl) - 160b
|
||
|
- .hword L(ipred_z1_fill1_tbl) - 80b
|
||
|
- .hword L(ipred_z1_fill1_tbl) - 40b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
function ipred_z1_fill2_8bpc_neon, export=1
|
||
|
@@ -1940,11 +1960,11 @@ endconst
|
||
|
// const int dx, const int dy);
|
||
|
function ipred_z2_fill1_8bpc_neon, export=1
|
||
|
clz w10, w4
|
||
|
- adr x9, L(ipred_z2_fill1_tbl)
|
||
|
+ adrp x9, L(ipred_z2_fill1_tbl)
|
||
|
+ add x9, x9, :lo12: L(ipred_z2_fill1_tbl)
|
||
|
sub w10, w10, #25
|
||
|
- ldrh w10, [x9, w10, uxtw #1]
|
||
|
+ ldr x9, [x9, w10, uxtw #3]
|
||
|
mov w8, #(1 << 6) // xpos = 1 << 6
|
||
|
- sub x9, x9, w10, uxtw
|
||
|
sub w8, w8, w6 // xpos -= dx
|
||
|
|
||
|
movrel x11, increments
|
||
|
@@ -2651,12 +2671,14 @@ function ipred_z2_fill1_8bpc_neon, export=1
|
||
|
ldp d8, d9, [sp], 0x40
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_z2_fill1_tbl):
|
||
|
- .hword L(ipred_z2_fill1_tbl) - 640b
|
||
|
- .hword L(ipred_z2_fill1_tbl) - 320b
|
||
|
- .hword L(ipred_z2_fill1_tbl) - 160b
|
||
|
- .hword L(ipred_z2_fill1_tbl) - 80b
|
||
|
- .hword L(ipred_z2_fill1_tbl) - 40b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
function ipred_z2_fill2_8bpc_neon, export=1
|
||
|
@@ -3160,11 +3182,11 @@ endfunc
|
||
|
function ipred_z3_fill1_8bpc_neon, export=1
|
||
|
cmp w6, #64
|
||
|
clz w9, w3
|
||
|
- adr x8, L(ipred_z3_fill1_tbl)
|
||
|
+ adrp x8, L(ipred_z3_fill1_tbl)
|
||
|
+ add x8, x8, :lo12: L(ipred_z3_fill1_tbl)
|
||
|
sub w9, w9, #25
|
||
|
- ldrh w9, [x8, w9, uxtw #1]
|
||
|
+ ldr x8, [x8, w9, uxtw #3]
|
||
|
add x10, x2, w6, uxtw // left[max_base_y]
|
||
|
- sub x8, x8, w9, uxtw
|
||
|
movrel x11, increments
|
||
|
ld1r {v31.16b}, [x10] // padding
|
||
|
ld1 {v30.8h}, [x11] // increments
|
||
|
@@ -3503,17 +3525,20 @@ L(ipred_z3_fill1_large_h16):
|
||
|
9:
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_z3_fill1_tbl):
|
||
|
- .hword L(ipred_z3_fill1_tbl) - 640b
|
||
|
- .hword L(ipred_z3_fill1_tbl) - 320b
|
||
|
- .hword L(ipred_z3_fill1_tbl) - 160b
|
||
|
- .hword L(ipred_z3_fill1_tbl) - 80b
|
||
|
- .hword L(ipred_z3_fill1_tbl) - 40b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
function ipred_z3_fill_padding_neon, export=0
|
||
|
cmp w3, #16
|
||
|
- adr x8, L(ipred_z3_fill_padding_tbl)
|
||
|
+ adrp x8, L(ipred_z3_fill_padding_tbl)
|
||
|
+ add x8, x8, :lo12: L(ipred_z3_fill_padding_tbl)
|
||
|
b.gt L(ipred_z3_fill_padding_wide)
|
||
|
// w3 = remaining width, w4 = constant height
|
||
|
mov w12, w4
|
||
|
@@ -3524,10 +3549,11 @@ function ipred_z3_fill_padding_neon, export=0
|
||
|
// power of two in the remaining width, and repeating.
|
||
|
clz w9, w3
|
||
|
sub w9, w9, #25
|
||
|
- ldrh w9, [x8, w9, uxtw #1]
|
||
|
- sub x9, x8, w9, uxtw
|
||
|
+ ldr x9, [x8, w9, uxtw #3]
|
||
|
br x9
|
||
|
|
||
|
+20:
|
||
|
+ AARCH64_VALID_JUMP_TARGET
|
||
|
2:
|
||
|
st1 {v31.h}[0], [x0], x1
|
||
|
subs w4, w4, #4
|
||
|
@@ -3546,6 +3572,8 @@ function ipred_z3_fill_padding_neon, export=0
|
||
|
mov w4, w12
|
||
|
b 1b
|
||
|
|
||
|
+40:
|
||
|
+ AARCH64_VALID_JUMP_TARGET
|
||
|
4:
|
||
|
st1 {v31.s}[0], [x0], x1
|
||
|
subs w4, w4, #4
|
||
|
@@ -3564,7 +3592,8 @@ function ipred_z3_fill_padding_neon, export=0
|
||
|
mov w4, w12
|
||
|
b 1b
|
||
|
|
||
|
-8:
|
||
|
+80:
|
||
|
+ AARCH64_VALID_JUMP_TARGET
|
||
|
st1 {v31.8b}, [x0], x1
|
||
|
subs w4, w4, #4
|
||
|
st1 {v31.8b}, [x13], x1
|
||
|
@@ -3582,9 +3611,10 @@ function ipred_z3_fill_padding_neon, export=0
|
||
|
mov w4, w12
|
||
|
b 1b
|
||
|
|
||
|
-16:
|
||
|
-32:
|
||
|
-64:
|
||
|
+160:
|
||
|
+320:
|
||
|
+640:
|
||
|
+ AARCH64_VALID_JUMP_TARGET
|
||
|
st1 {v31.16b}, [x0], x1
|
||
|
subs w4, w4, #4
|
||
|
st1 {v31.16b}, [x13], x1
|
||
|
@@ -3605,13 +3635,15 @@ function ipred_z3_fill_padding_neon, export=0
|
||
|
9:
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_z3_fill_padding_tbl):
|
||
|
- .hword L(ipred_z3_fill_padding_tbl) - 64b
|
||
|
- .hword L(ipred_z3_fill_padding_tbl) - 32b
|
||
|
- .hword L(ipred_z3_fill_padding_tbl) - 16b
|
||
|
- .hword L(ipred_z3_fill_padding_tbl) - 8b
|
||
|
- .hword L(ipred_z3_fill_padding_tbl) - 4b
|
||
|
- .hword L(ipred_z3_fill_padding_tbl) - 2b
|
||
|
+ .xword 640b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .xword 20b
|
||
|
+ .popsection
|
||
|
|
||
|
L(ipred_z3_fill_padding_wide):
|
||
|
// Fill a WxH rectangle with padding, with W > 16.
|
||
|
@@ -3766,13 +3798,13 @@ function ipred_filter_8bpc_neon, export=1
|
||
|
add x6, x6, w5, uxtw
|
||
|
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
|
||
|
clz w9, w3
|
||
|
- adr x5, L(ipred_filter_tbl)
|
||
|
+ adrp x5, L(ipred_filter_tbl)
|
||
|
+ add x5, x5, :lo12: L(ipred_filter_tbl)
|
||
|
ld1 {v20.8b, v21.8b, v22.8b}, [x6]
|
||
|
sub w9, w9, #26
|
||
|
- ldrh w9, [x5, w9, uxtw #1]
|
||
|
+ ldr x5, [x5, w9, uxtw #3]
|
||
|
sxtl v16.8h, v16.8b
|
||
|
sxtl v17.8h, v17.8b
|
||
|
- sub x5, x5, w9, uxtw
|
||
|
sxtl v18.8h, v18.8b
|
||
|
sxtl v19.8h, v19.8b
|
||
|
add x6, x0, x1
|
||
|
@@ -3913,11 +3945,13 @@ function ipred_filter_8bpc_neon, export=1
|
||
|
9:
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_filter_tbl):
|
||
|
- .hword L(ipred_filter_tbl) - 320b
|
||
|
- .hword L(ipred_filter_tbl) - 160b
|
||
|
- .hword L(ipred_filter_tbl) - 80b
|
||
|
- .hword L(ipred_filter_tbl) - 40b
|
||
|
+ .xword 320b
|
||
|
+ .xword 160b
|
||
|
+ .xword 80b
|
||
|
+ .xword 40b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -3926,11 +3960,11 @@ endfunc
|
||
|
function pal_pred_8bpc_neon, export=1
|
||
|
ld1 {v0.8h}, [x2]
|
||
|
clz w9, w4
|
||
|
- adr x6, L(pal_pred_tbl)
|
||
|
+ adrp x6, L(pal_pred_tbl)
|
||
|
+ add x6, x6, :lo12: L(pal_pred_tbl)
|
||
|
sub w9, w9, #25
|
||
|
- ldrh w9, [x6, w9, uxtw #1]
|
||
|
+ ldr x6, [x6, w9, uxtw #3]
|
||
|
xtn v0.8b, v0.8h
|
||
|
- sub x6, x6, w9, uxtw
|
||
|
add x2, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x6
|
||
|
@@ -4008,12 +4042,14 @@ function pal_pred_8bpc_neon, export=1
|
||
|
b.gt 64b
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(pal_pred_tbl):
|
||
|
- .hword L(pal_pred_tbl) - 64b
|
||
|
- .hword L(pal_pred_tbl) - 32b
|
||
|
- .hword L(pal_pred_tbl) - 16b
|
||
|
- .hword L(pal_pred_tbl) - 8b
|
||
|
- .hword L(pal_pred_tbl) - 4b
|
||
|
+ .xword 64b
|
||
|
+ .xword 32b
|
||
|
+ .xword 16b
|
||
|
+ .xword 8b
|
||
|
+ .xword 4b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -4022,12 +4058,12 @@ endfunc
|
||
|
// const int16_t *ac, const int alpha);
|
||
|
function ipred_cfl_128_8bpc_neon, export=1
|
||
|
clz w9, w3
|
||
|
- adr x7, L(ipred_cfl_128_tbl)
|
||
|
+ adrp x7, L(ipred_cfl_128_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_128_tbl)
|
||
|
sub w9, w9, #26
|
||
|
- ldrh w9, [x7, w9, uxtw #1]
|
||
|
+ ldr x7, [x7, w9, uxtw #3]
|
||
|
movi v0.8h, #128 // dc
|
||
|
dup v1.8h, w6 // alpha
|
||
|
- sub x7, x7, w9, uxtw
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x7
|
||
|
@@ -4132,12 +4168,14 @@ L(ipred_cfl_splat_w16):
|
||
|
b.gt 1b
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_cfl_128_tbl):
|
||
|
L(ipred_cfl_splat_tbl):
|
||
|
- .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
|
||
|
- .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
|
||
|
- .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
|
||
|
- .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
|
||
|
+ .xword L(ipred_cfl_splat_w16)
|
||
|
+ .xword L(ipred_cfl_splat_w16)
|
||
|
+ .xword L(ipred_cfl_splat_w8)
|
||
|
+ .xword L(ipred_cfl_splat_w4)
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -4146,12 +4184,12 @@ endfunc
|
||
|
// const int16_t *ac, const int alpha);
|
||
|
function ipred_cfl_top_8bpc_neon, export=1
|
||
|
clz w9, w3
|
||
|
- adr x7, L(ipred_cfl_top_tbl)
|
||
|
+ adrp x7, L(ipred_cfl_top_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_top_tbl)
|
||
|
sub w9, w9, #26
|
||
|
- ldrh w9, [x7, w9, uxtw #1]
|
||
|
+ ldr x7, [x7, w9, uxtw #3]
|
||
|
dup v1.8h, w6 // alpha
|
||
|
add x2, x2, #1
|
||
|
- sub x7, x7, w9, uxtw
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x7
|
||
|
@@ -4186,11 +4224,13 @@ function ipred_cfl_top_8bpc_neon, export=1
|
||
|
dup v0.8h, v2.h[0]
|
||
|
b L(ipred_cfl_splat_w16)
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_cfl_top_tbl):
|
||
|
- .hword L(ipred_cfl_top_tbl) - 32b
|
||
|
- .hword L(ipred_cfl_top_tbl) - 16b
|
||
|
- .hword L(ipred_cfl_top_tbl) - 8b
|
||
|
- .hword L(ipred_cfl_top_tbl) - 4b
|
||
|
+ .xword 32b
|
||
|
+ .xword 16b
|
||
|
+ .xword 8b
|
||
|
+ .xword 4b
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -4201,15 +4241,15 @@ function ipred_cfl_left_8bpc_neon, export=1
|
||
|
sub x2, x2, w4, uxtw
|
||
|
clz w9, w3
|
||
|
clz w8, w4
|
||
|
- adr x10, L(ipred_cfl_splat_tbl)
|
||
|
- adr x7, L(ipred_cfl_left_tbl)
|
||
|
+ adrp x10, L(ipred_cfl_splat_tbl)
|
||
|
+ add x10, x10, :lo12: L(ipred_cfl_splat_tbl)
|
||
|
+ adrp x7, L(ipred_cfl_left_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_left_tbl)
|
||
|
sub w9, w9, #26
|
||
|
sub w8, w8, #26
|
||
|
- ldrh w9, [x10, w9, uxtw #1]
|
||
|
- ldrh w8, [x7, w8, uxtw #1]
|
||
|
+ ldr x9, [x10, w9, uxtw #3]
|
||
|
+ ldr x7, [x7, w8, uxtw #3]
|
||
|
dup v1.8h, w6 // alpha
|
||
|
- sub x9, x10, w9, uxtw
|
||
|
- sub x7, x7, w8, uxtw
|
||
|
add x6, x0, x1
|
||
|
lsl x1, x1, #1
|
||
|
br x7
|
||
|
@@ -4248,11 +4288,13 @@ L(ipred_cfl_left_h32):
|
||
|
dup v0.8h, v2.h[0]
|
||
|
br x9
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_cfl_left_tbl):
|
||
|
- .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
|
||
|
- .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
|
||
|
- .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
|
||
|
- .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
|
||
|
+ .xword L(ipred_cfl_left_h32)
|
||
|
+ .xword L(ipred_cfl_left_h16)
|
||
|
+ .xword L(ipred_cfl_left_h8)
|
||
|
+ .xword L(ipred_cfl_left_h4)
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
|
||
|
@@ -4266,16 +4308,15 @@ function ipred_cfl_8bpc_neon, export=1
|
||
|
clz w9, w3
|
||
|
clz w6, w4
|
||
|
dup v16.8h, w8 // width + height
|
||
|
- adr x7, L(ipred_cfl_tbl)
|
||
|
+ adrp x7, L(ipred_cfl_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_tbl)
|
||
|
rbit w8, w8 // rbit(width + height)
|
||
|
sub w9, w9, #22 // 26 leading bits, minus table offset 4
|
||
|
sub w6, w6, #26
|
||
|
clz w8, w8 // ctz(width + height)
|
||
|
- ldrh w9, [x7, w9, uxtw #1]
|
||
|
- ldrh w6, [x7, w6, uxtw #1]
|
||
|
+ ldr x9, [x7, w9, uxtw #3]
|
||
|
+ ldr x7, [x7, w6, uxtw #3]
|
||
|
neg w8, w8 // -ctz(width + height)
|
||
|
- sub x9, x7, w9, uxtw
|
||
|
- sub x7, x7, w6, uxtw
|
||
|
ushr v16.8h, v16.8h, #1 // (width + height) >> 1
|
||
|
dup v17.8h, w8 // -ctz(width + height)
|
||
|
add x6, x0, x1
|
||
|
@@ -4392,15 +4433,17 @@ L(ipred_cfl_w32):
|
||
|
dup v0.8h, v0.h[0]
|
||
|
b L(ipred_cfl_splat_w16)
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_cfl_tbl):
|
||
|
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
|
||
|
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
|
||
|
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
|
||
|
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
|
||
|
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
|
||
|
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
|
||
|
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
|
||
|
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
|
||
|
+ .xword L(ipred_cfl_h32)
|
||
|
+ .xword L(ipred_cfl_h16)
|
||
|
+ .xword L(ipred_cfl_h8)
|
||
|
+ .xword L(ipred_cfl_h4)
|
||
|
+ .xword L(ipred_cfl_w32)
|
||
|
+ .xword L(ipred_cfl_w16)
|
||
|
+ .xword L(ipred_cfl_w8)
|
||
|
+ .xword L(ipred_cfl_w4)
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
|
||
|
@@ -4409,14 +4452,14 @@ endfunc
|
||
|
function ipred_cfl_ac_420_8bpc_neon, export=1
|
||
|
clz w8, w5
|
||
|
lsl w4, w4, #2
|
||
|
- adr x7, L(ipred_cfl_ac_420_tbl)
|
||
|
+ adrp x7, L(ipred_cfl_ac_420_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_ac_420_tbl)
|
||
|
sub w8, w8, #27
|
||
|
- ldrh w8, [x7, w8, uxtw #1]
|
||
|
+ ldr x7, [x7, w8, uxtw #3]
|
||
|
movi v16.8h, #0
|
||
|
movi v17.8h, #0
|
||
|
movi v18.8h, #0
|
||
|
movi v19.8h, #0
|
||
|
- sub x7, x7, w8, uxtw
|
||
|
sub w8, w6, w4 // height - h_pad
|
||
|
rbit w9, w5 // rbit(width)
|
||
|
rbit w10, w6 // rbit(height)
|
||
|
@@ -4555,9 +4598,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc):
|
||
|
|
||
|
L(ipred_cfl_ac_420_w16):
|
||
|
AARCH64_VALID_JUMP_TARGET
|
||
|
- adr x7, L(ipred_cfl_ac_420_w16_tbl)
|
||
|
- ldrh w3, [x7, w3, uxtw #1]
|
||
|
- sub x7, x7, w3, uxtw
|
||
|
+ adrp x7, L(ipred_cfl_ac_420_w16_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_ac_420_w16_tbl)
|
||
|
+ ldr x7, [x7, w3, uxtw #3]
|
||
|
br x7
|
||
|
|
||
|
L(ipred_cfl_ac_420_w16_wpad0):
|
||
|
@@ -4714,17 +4757,19 @@ L(ipred_cfl_ac_420_w16_hpad):
|
||
|
lsl w6, w6, #1
|
||
|
b L(ipred_cfl_ac_420_w8_calc_subtract_dc)
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_cfl_ac_420_tbl):
|
||
|
- .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
|
||
|
- .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
|
||
|
- .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
|
||
|
- .hword 0
|
||
|
+ .xword L(ipred_cfl_ac_420_w16)
|
||
|
+ .xword L(ipred_cfl_ac_420_w8)
|
||
|
+ .xword L(ipred_cfl_ac_420_w4)
|
||
|
+ .xword 0
|
||
|
|
||
|
L(ipred_cfl_ac_420_w16_tbl):
|
||
|
- .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
|
||
|
- .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
|
||
|
- .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
|
||
|
- .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
|
||
|
+ .xword L(ipred_cfl_ac_420_w16_wpad0)
|
||
|
+ .xword L(ipred_cfl_ac_420_w16_wpad1)
|
||
|
+ .xword L(ipred_cfl_ac_420_w16_wpad2)
|
||
|
+ .xword L(ipred_cfl_ac_420_w16_wpad3)
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
|
||
|
@@ -4733,14 +4778,14 @@ endfunc
|
||
|
function ipred_cfl_ac_422_8bpc_neon, export=1
|
||
|
clz w8, w5
|
||
|
lsl w4, w4, #2
|
||
|
- adr x7, L(ipred_cfl_ac_422_tbl)
|
||
|
+ adrp x7, L(ipred_cfl_ac_422_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_ac_422_tbl)
|
||
|
sub w8, w8, #27
|
||
|
- ldrh w8, [x7, w8, uxtw #1]
|
||
|
+ ldr x7, [x7, w8, uxtw #3]
|
||
|
movi v16.8h, #0
|
||
|
movi v17.8h, #0
|
||
|
movi v18.8h, #0
|
||
|
movi v19.8h, #0
|
||
|
- sub x7, x7, w8, uxtw
|
||
|
sub w8, w6, w4 // height - h_pad
|
||
|
rbit w9, w5 // rbit(width)
|
||
|
rbit w10, w6 // rbit(height)
|
||
|
@@ -4831,9 +4876,9 @@ L(ipred_cfl_ac_422_w8_wpad):
|
||
|
|
||
|
L(ipred_cfl_ac_422_w16):
|
||
|
AARCH64_VALID_JUMP_TARGET
|
||
|
- adr x7, L(ipred_cfl_ac_422_w16_tbl)
|
||
|
- ldrh w3, [x7, w3, uxtw #1]
|
||
|
- sub x7, x7, w3, uxtw
|
||
|
+ adrp x7, L(ipred_cfl_ac_422_w16_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_ac_422_w16_tbl)
|
||
|
+ ldr x7, [x7, w3, uxtw #3]
|
||
|
br x7
|
||
|
|
||
|
L(ipred_cfl_ac_422_w16_wpad0):
|
||
|
@@ -4936,17 +4981,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
|
||
|
mov v1.16b, v3.16b
|
||
|
b L(ipred_cfl_ac_420_w16_hpad)
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_cfl_ac_422_tbl):
|
||
|
- .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
|
||
|
- .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
|
||
|
- .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
|
||
|
- .hword 0
|
||
|
+ .xword L(ipred_cfl_ac_422_w16)
|
||
|
+ .xword L(ipred_cfl_ac_422_w8)
|
||
|
+ .xword L(ipred_cfl_ac_422_w4)
|
||
|
+ .xword 0
|
||
|
|
||
|
L(ipred_cfl_ac_422_w16_tbl):
|
||
|
- .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
|
||
|
- .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
|
||
|
- .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
|
||
|
- .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
|
||
|
+ .xword L(ipred_cfl_ac_422_w16_wpad0)
|
||
|
+ .xword L(ipred_cfl_ac_422_w16_wpad1)
|
||
|
+ .xword L(ipred_cfl_ac_422_w16_wpad2)
|
||
|
+ .xword L(ipred_cfl_ac_422_w16_wpad3)
|
||
|
+ .popsection
|
||
|
endfunc
|
||
|
|
||
|
// void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
|
||
|
@@ -4955,14 +5002,14 @@ endfunc
|
||
|
function ipred_cfl_ac_444_8bpc_neon, export=1
|
||
|
clz w8, w5
|
||
|
lsl w4, w4, #2
|
||
|
- adr x7, L(ipred_cfl_ac_444_tbl)
|
||
|
+ adrp x7, L(ipred_cfl_ac_444_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_ac_444_tbl)
|
||
|
sub w8, w8, #26
|
||
|
- ldrh w8, [x7, w8, uxtw #1]
|
||
|
+ ldr x7, [x7, w8, uxtw #3]
|
||
|
movi v16.8h, #0
|
||
|
movi v17.8h, #0
|
||
|
movi v18.8h, #0
|
||
|
movi v19.8h, #0
|
||
|
- sub x7, x7, w8, uxtw
|
||
|
sub w8, w6, w4 // height - h_pad
|
||
|
rbit w9, w5 // rbit(width)
|
||
|
rbit w10, w6 // rbit(height)
|
||
|
@@ -5083,9 +5130,10 @@ L(ipred_cfl_ac_444_w16_wpad):
|
||
|
|
||
|
L(ipred_cfl_ac_444_w32):
|
||
|
AARCH64_VALID_JUMP_TARGET
|
||
|
- adr x7, L(ipred_cfl_ac_444_w32_tbl)
|
||
|
- ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1
|
||
|
- sub x7, x7, w3, uxtw
|
||
|
+ adrp x7, L(ipred_cfl_ac_444_w32_tbl)
|
||
|
+ add x7, x7, :lo12: L(ipred_cfl_ac_444_w32_tbl)
|
||
|
+ lsr w3, w3, #1
|
||
|
+ ldr x7, [x7, w3, uxtw #3] // w3 already halved above; scaled by 8 to index the .xword entries
|
||
|
br x7
|
||
|
|
||
|
L(ipred_cfl_ac_444_w32_wpad0):
|
||
|
@@ -5231,15 +5279,17 @@ L(ipred_cfl_ac_444_w32_hpad):
|
||
|
dup v4.8h, v4.h[0]
|
||
|
b L(ipred_cfl_ac_420_w8_subtract_dc)
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw"
|
||
|
L(ipred_cfl_ac_444_tbl):
|
||
|
- .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
|
||
|
- .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
|
||
|
- .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
|
||
|
- .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
|
||
|
+ .xword L(ipred_cfl_ac_444_w32)
|
||
|
+ .xword L(ipred_cfl_ac_444_w16)
|
||
|
+ .xword L(ipred_cfl_ac_444_w8)
|
||
|
+ .xword L(ipred_cfl_ac_444_w4)
|
||
|
|
||
|
L(ipred_cfl_ac_444_w32_tbl):
|
||
|
- .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
|
||
|
- .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
|
||
|
- .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
|
||
|
- .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
|
||
|
+ .xword L(ipred_cfl_ac_444_w32_wpad0)
|
||
|
+ .xword L(ipred_cfl_ac_444_w32_wpad2)
|
||
|
+ .xword L(ipred_cfl_ac_444_w32_wpad4)
|
||
|
+ .xword L(ipred_cfl_ac_444_w32_wpad6)
|
||
|
+ .popsection
|
||
|
endfunc
|