ports/multimedia/dav1d/patches/patch-src_arm_64_ipred16_S

965 lines
36 KiB
Text

Index: src/arm/64/ipred16.S
--- src/arm/64/ipred16.S.orig
+++ src/arm/64/ipred16.S
@@ -36,11 +36,11 @@
function ipred_dc_128_16bpc_neon, export=1
ldr w8, [sp]
clz w3, w3
- adr x5, L(ipred_dc_128_tbl)
+ adrp x5, L(ipred_dc_128_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_dc_128_tbl) // x5 = address of the jump table
sub w3, w3, #25
- ldrh w3, [x5, w3, uxtw #1]
+ ldr x5, [x5, w3, uxtw #3] // x5 = absolute branch target, entry w3 (8 bytes each)
dup v0.8h, w8
- sub x5, x5, w3, uxtw
add x6, x0, x1
lsl x1, x1, #1
urshr v0.8h, v0.8h, #1
@@ -106,12 +106,15 @@ function ipred_dc_128_16bpc_neon, export=1
b.gt 64b
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_dc_128_tbl):
- .hword L(ipred_dc_128_tbl) - 640b
- .hword L(ipred_dc_128_tbl) - 320b
- .hword L(ipred_dc_128_tbl) - 160b
- .hword L(ipred_dc_128_tbl) - 8b
- .hword L(ipred_dc_128_tbl) - 4b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 8b
+ .xword 4b
+ .popsection
endfunc
// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -120,11 +122,11 @@ endfunc
// const int max_width, const int max_height);
function ipred_v_16bpc_neon, export=1
clz w3, w3
- adr x5, L(ipred_v_tbl)
+ adrp x5, L(ipred_v_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_v_tbl) // x5 = address of the jump table
sub w3, w3, #25
- ldrh w3, [x5, w3, uxtw #1]
+ ldr x5, [x5, w3, uxtw #3] // x5 = absolute branch target, entry w3 (8 bytes each)
add x2, x2, #2
- sub x5, x5, w3, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
@@ -190,12 +192,15 @@ function ipred_v_16bpc_neon, export=1
b.gt 64b
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_v_tbl):
- .hword L(ipred_v_tbl) - 640b
- .hword L(ipred_v_tbl) - 320b
- .hword L(ipred_v_tbl) - 160b
- .hword L(ipred_v_tbl) - 80b
- .hword L(ipred_v_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -204,11 +208,11 @@ endfunc
// const int max_width, const int max_height);
function ipred_h_16bpc_neon, export=1
clz w3, w3
- adr x5, L(ipred_h_tbl)
+ adrp x5, L(ipred_h_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_h_tbl) // x5 = address of the jump table
sub w3, w3, #25
- ldrh w3, [x5, w3, uxtw #1]
+ ldr x5, [x5, w3, uxtw #3] // x5 = absolute branch target, entry w3 (8 bytes each)
sub x2, x2, #8
- sub x5, x5, w3, uxtw
mov x7, #-8
add x6, x0, x1
lsl x1, x1, #1
@@ -292,12 +296,15 @@ function ipred_h_16bpc_neon, export=1
b.gt 64b
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_h_tbl):
- .hword L(ipred_h_tbl) - 64b
- .hword L(ipred_h_tbl) - 32b
- .hword L(ipred_h_tbl) - 16b
- .hword L(ipred_h_tbl) - 8b
- .hword L(ipred_h_tbl) - 4b
+ .xword 64b // each entry is the absolute 64-bit address of a branch target
+ .xword 32b
+ .xword 16b
+ .xword 8b
+ .xword 4b
+ .popsection
endfunc
// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -306,11 +312,11 @@ endfunc
// const int max_width, const int max_height);
function ipred_dc_top_16bpc_neon, export=1
clz w3, w3
- adr x5, L(ipred_dc_top_tbl)
+ adrp x5, L(ipred_dc_top_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_dc_top_tbl) // x5 = address of the jump table
sub w3, w3, #25
- ldrh w3, [x5, w3, uxtw #1]
+ ldr x5, [x5, w3, uxtw #3] // x5 = absolute branch target, entry w3 (8 bytes each)
add x2, x2, #2
- sub x5, x5, w3, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
@@ -409,12 +415,15 @@ function ipred_dc_top_16bpc_neon, export=1
b.gt 64b
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_dc_top_tbl):
- .hword L(ipred_dc_top_tbl) - 640b
- .hword L(ipred_dc_top_tbl) - 320b
- .hword L(ipred_dc_top_tbl) - 160b
- .hword L(ipred_dc_top_tbl) - 80b
- .hword L(ipred_dc_top_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -425,13 +433,12 @@ function ipred_dc_left_16bpc_neon, export=1
sub x2, x2, w4, uxtw #1
clz w3, w3
clz w7, w4
- adr x5, L(ipred_dc_left_tbl)
+ adrp x5, L(ipred_dc_left_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_dc_left_tbl) // x5 = address of the jump table
sub w3, w3, #20 // 25 leading bits, minus table offset 5
sub w7, w7, #25
- ldrh w3, [x5, w3, uxtw #1]
- ldrh w7, [x5, w7, uxtw #1]
- sub x3, x5, w3, uxtw
- sub x5, x5, w7, uxtw
+ ldr x3, [x5, w3, uxtw #3] // x3 = target from the _w* entries (index offset by 5)
+ ldr x5, [x5, w7, uxtw #3] // x5 = target from the _h* entries; loaded last as it overwrites the table base
add x6, x0, x1
lsl x1, x1, #1
br x5
@@ -550,17 +557,20 @@ L(ipred_dc_left_w64):
b.gt 1b
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_dc_left_tbl):
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
- .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+ .xword L(ipred_dc_left_h64) // entries 0-4: _h* targets, absolute 64-bit addresses
+ .xword L(ipred_dc_left_h32)
+ .xword L(ipred_dc_left_h16)
+ .xword L(ipred_dc_left_h8)
+ .xword L(ipred_dc_left_h4)
+ .xword L(ipred_dc_left_w64) // entries 5-9: _w* targets
+ .xword L(ipred_dc_left_w32)
+ .xword L(ipred_dc_left_w16)
+ .xword L(ipred_dc_left_w8)
+ .xword L(ipred_dc_left_w4)
+ .popsection
endfunc
// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -573,16 +582,15 @@ function ipred_dc_16bpc_neon, export=1
clz w3, w3
clz w6, w4
dup v16.4s, w7 // width + height
- adr x5, L(ipred_dc_tbl)
+ adrp x5, L(ipred_dc_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_dc_tbl) // x5 = address of the jump table
rbit w7, w7 // rbit(width + height)
sub w3, w3, #20 // 25 leading bits, minus table offset 5
sub w6, w6, #25
clz w7, w7 // ctz(width + height)
- ldrh w3, [x5, w3, uxtw #1]
- ldrh w6, [x5, w6, uxtw #1]
+ ldr x3, [x5, w3, uxtw #3] // x3 = target from the _w* entries (index offset by 5)
+ ldr x5, [x5, w6, uxtw #3] // x5 = target from the _h* entries; loaded last as it overwrites the table base
neg w7, w7 // -ctz(width + height)
- sub x3, x5, w3, uxtw
- sub x5, x5, w6, uxtw
ushr v16.4s, v16.4s, #1 // (width + height) >> 1
dup v17.4s, w7 // -ctz(width + height)
add x6, x0, x1
@@ -795,17 +803,20 @@ L(ipred_dc_w64):
b.gt 2b
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_dc_tbl):
- .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
- .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
- .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
- .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
- .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
- .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
- .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
- .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
- .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
- .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+ .xword L(ipred_dc_h64) // entries 0-4: _h* targets, absolute 64-bit addresses
+ .xword L(ipred_dc_h32)
+ .xword L(ipred_dc_h16)
+ .xword L(ipred_dc_h8)
+ .xword L(ipred_dc_h4)
+ .xword L(ipred_dc_w64) // entries 5-9: _w* targets
+ .xword L(ipred_dc_w32)
+ .xword L(ipred_dc_w16)
+ .xword L(ipred_dc_w8)
+ .xword L(ipred_dc_w4)
+ .popsection
endfunc
// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -814,13 +824,13 @@ endfunc
// const int max_width, const int max_height);
function ipred_paeth_16bpc_neon, export=1
clz w9, w3
- adr x5, L(ipred_paeth_tbl)
+ adrp x5, L(ipred_paeth_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_paeth_tbl) // x5 = address of the jump table
sub w9, w9, #25
- ldrh w9, [x5, w9, uxtw #1]
+ ldr x5, [x5, w9, uxtw #3] // x5 = absolute branch target, entry w9 (8 bytes each)
ld1r {v4.8h}, [x2]
add x8, x2, #2
sub x2, x2, #8
- sub x5, x5, w9, uxtw
mov x7, #-8
add x6, x0, x1
lsl x1, x1, #1
@@ -934,12 +944,15 @@ function ipred_paeth_16bpc_neon, export=1
9:
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_paeth_tbl):
- .hword L(ipred_paeth_tbl) - 640b
- .hword L(ipred_paeth_tbl) - 320b
- .hword L(ipred_paeth_tbl) - 160b
- .hword L(ipred_paeth_tbl) - 80b
- .hword L(ipred_paeth_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -951,13 +963,13 @@ function ipred_smooth_16bpc_neon, export=1
add x11, x10, w4, uxtw
add x10, x10, w3, uxtw
clz w9, w3
- adr x5, L(ipred_smooth_tbl)
+ adrp x5, L(ipred_smooth_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_smooth_tbl) // x5 = address of the jump table
sub x12, x2, w4, uxtw #1
sub w9, w9, #25
- ldrh w9, [x5, w9, uxtw #1]
+ ldr x5, [x5, w9, uxtw #3] // x5 = absolute branch target, entry w9 (8 bytes each)
ld1r {v4.8h}, [x12] // bottom
add x8, x2, #2
- sub x5, x5, w9, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
@@ -1138,12 +1150,15 @@ function ipred_smooth_16bpc_neon, export=1
9:
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_smooth_tbl):
- .hword L(ipred_smooth_tbl) - 640b
- .hword L(ipred_smooth_tbl) - 320b
- .hword L(ipred_smooth_tbl) - 160b
- .hword L(ipred_smooth_tbl) - 80b
- .hword L(ipred_smooth_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -1154,13 +1168,13 @@ function ipred_smooth_v_16bpc_neon, export=1
movrel x7, X(sm_weights)
add x7, x7, w4, uxtw
clz w9, w3
- adr x5, L(ipred_smooth_v_tbl)
+ adrp x5, L(ipred_smooth_v_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_smooth_v_tbl) // x5 = address of the jump table
sub x8, x2, w4, uxtw #1
sub w9, w9, #25
- ldrh w9, [x5, w9, uxtw #1]
+ ldr x5, [x5, w9, uxtw #3] // x5 = absolute branch target, entry w9 (8 bytes each)
ld1r {v4.8h}, [x8] // bottom
add x2, x2, #2
- sub x5, x5, w9, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
@@ -1265,12 +1279,15 @@ function ipred_smooth_v_16bpc_neon, export=1
9:
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_smooth_v_tbl):
- .hword L(ipred_smooth_v_tbl) - 640b
- .hword L(ipred_smooth_v_tbl) - 320b
- .hword L(ipred_smooth_v_tbl) - 160b
- .hword L(ipred_smooth_v_tbl) - 80b
- .hword L(ipred_smooth_v_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -1281,12 +1297,12 @@ function ipred_smooth_h_16bpc_neon, export=1
movrel x8, X(sm_weights)
add x8, x8, w3, uxtw
clz w9, w3
- adr x5, L(ipred_smooth_h_tbl)
+ adrp x5, L(ipred_smooth_h_tbl) // page address of the jump table
+ add x5, x5, :lo12: L(ipred_smooth_h_tbl) // x5 = address of the jump table
add x12, x2, w3, uxtw #1
sub w9, w9, #25
- ldrh w9, [x5, w9, uxtw #1]
+ ldr x5, [x5, w9, uxtw #3] // x5 = absolute branch target, entry w9 (8 bytes each)
ld1r {v5.8h}, [x12] // right
- sub x5, x5, w9, uxtw
add x6, x0, x1
lsl x1, x1, #1
br x5
@@ -1397,12 +1413,15 @@ function ipred_smooth_h_16bpc_neon, export=1
9:
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_smooth_h_tbl):
- .hword L(ipred_smooth_h_tbl) - 640b
- .hword L(ipred_smooth_h_tbl) - 320b
- .hword L(ipred_smooth_h_tbl) - 160b
- .hword L(ipred_smooth_h_tbl) - 80b
- .hword L(ipred_smooth_h_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
const padding_mask_buf
@@ -1728,11 +1746,11 @@ endfunc
// const int dx, const int max_base_x);
function ipred_z1_fill1_16bpc_neon, export=1
clz w9, w3
- adr x8, L(ipred_z1_fill1_tbl)
+ adrp x8, L(ipred_z1_fill1_tbl) // page address of the jump table
+ add x8, x8, :lo12: L(ipred_z1_fill1_tbl) // x8 = address of the jump table
sub w9, w9, #25
- ldrh w9, [x8, w9, uxtw #1]
+ ldr x8, [x8, w9, uxtw #3] // x8 = absolute branch target, entry w9 (8 bytes each)
add x10, x2, w6, uxtw #1 // top[max_base_x]
- sub x8, x8, w9, uxtw
ld1r {v31.8h}, [x10] // padding
mov w7, w5
mov w15, #64
@@ -1917,12 +1935,15 @@ function ipred_z1_fill1_16bpc_neon, export=1
mov w3, w12
b 169b
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_z1_fill1_tbl):
- .hword L(ipred_z1_fill1_tbl) - 640b
- .hword L(ipred_z1_fill1_tbl) - 320b
- .hword L(ipred_z1_fill1_tbl) - 160b
- .hword L(ipred_z1_fill1_tbl) - 80b
- .hword L(ipred_z1_fill1_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
function ipred_z1_fill2_16bpc_neon, export=1
@@ -2050,11 +2070,11 @@ endconst
// const int dx, const int dy);
function ipred_z2_fill1_16bpc_neon, export=1
clz w10, w4
- adr x9, L(ipred_z2_fill1_tbl)
+ adrp x9, L(ipred_z2_fill1_tbl) // page address of the jump table
+ add x9, x9, :lo12: L(ipred_z2_fill1_tbl) // x9 = address of the jump table
sub w10, w10, #25
- ldrh w10, [x9, w10, uxtw #1]
+ ldr x9, [x9, w10, uxtw #3] // x9 = absolute branch target, entry w10 (8 bytes each)
mov w8, #(1 << 6) // xpos = 1 << 6
- sub x9, x9, w10, uxtw
sub w8, w8, w6 // xpos -= dx
movrel x11, increments
@@ -2815,12 +2835,15 @@ function ipred_z2_fill1_16bpc_neon, export=1
ldp d8, d9, [sp], 0x40
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_z2_fill1_tbl):
- .hword L(ipred_z2_fill1_tbl) - 640b
- .hword L(ipred_z2_fill1_tbl) - 320b
- .hword L(ipred_z2_fill1_tbl) - 160b
- .hword L(ipred_z2_fill1_tbl) - 80b
- .hword L(ipred_z2_fill1_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
function ipred_z2_fill2_16bpc_neon, export=1
@@ -3432,11 +3454,11 @@ endfunc
// const int dy, const int max_base_y);
function ipred_z3_fill1_16bpc_neon, export=1
clz w9, w4
- adr x8, L(ipred_z3_fill1_tbl)
+ adrp x8, L(ipred_z3_fill1_tbl) // page address of the jump table
+ add x8, x8, :lo12: L(ipred_z3_fill1_tbl) // x8 = address of the jump table
sub w9, w9, #25
- ldrh w9, [x8, w9, uxtw #1]
+ ldr x8, [x8, w9, uxtw #3] // x8 = absolute branch target, entry w9 (8 bytes each)
add x10, x2, w6, uxtw #1 // left[max_base_y]
- sub x8, x8, w9, uxtw
ld1r {v31.8h}, [x10] // padding
mov w7, w5
mov w15, #64
@@ -3638,17 +3660,21 @@ function ipred_z3_fill1_16bpc_neon, export=1
9:
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_z3_fill1_tbl):
- .hword L(ipred_z3_fill1_tbl) - 640b
- .hword L(ipred_z3_fill1_tbl) - 320b
- .hword L(ipred_z3_fill1_tbl) - 160b
- .hword L(ipred_z3_fill1_tbl) - 80b
- .hword L(ipred_z3_fill1_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
function ipred_z3_fill_padding_neon, export=0
cmp w3, #8
- adr x8, L(ipred_z3_fill_padding_tbl)
+ adrp x8, L(ipred_z3_fill_padding_tbl) // page address of the padding jump table
+ add x8, x8, :lo12: L(ipred_z3_fill_padding_tbl) // x8 = address of the padding jump table
b.gt L(ipred_z3_fill_padding_wide)
// w3 = remaining width, w4 = constant height
mov w12, w4
@@ -3659,10 +3684,11 @@ function ipred_z3_fill_padding_neon, export=0
// power of two in the remaining width, and repeating.
clz w9, w3
sub w9, w9, #25
- ldrh w9, [x8, w9, uxtw #1]
- sub x9, x8, w9, uxtw
+ ldr x9, [x8, w9, uxtw #3] // x9 = absolute target from entry w9; x8 keeps the table base
br x9
+20: // new table target for the 2-wide case (table entry is 20b)
+ AARCH64_VALID_JUMP_TARGET // marker placed ahead of the existing 2: label, which is left in place
2:
st1 {v31.s}[0], [x0], x1
subs w4, w4, #4
@@ -3681,6 +3707,8 @@ function ipred_z3_fill_padding_neon, export=0
mov w4, w12
b 1b
+40: // new table target for the 4-wide case (table entry is 40b)
+ AARCH64_VALID_JUMP_TARGET // marker placed ahead of the existing 4: label, which is left in place
4:
st1 {v31.4h}, [x0], x1
subs w4, w4, #4
@@ -3699,10 +3727,11 @@ function ipred_z3_fill_padding_neon, export=0
mov w4, w12
b 1b
-8:
-16:
-32:
-64:
+80: // table entries 80b/160b/320b/640b all land on the same 8h store sequence
+160:
+320:
+640:
+ AARCH64_VALID_JUMP_TARGET // NOTE(review): the old 8:/16:/32:/64: labels are removed here; confirm no "8b"-style branch outside this hunk still relies on them
st1 {v31.8h}, [x0], x1
subs w4, w4, #4
st1 {v31.8h}, [x13], x1
@@ -3723,13 +3752,16 @@ function ipred_z3_fill_padding_neon, export=0
9:
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_z3_fill_padding_tbl):
- .hword L(ipred_z3_fill_padding_tbl) - 64b
- .hword L(ipred_z3_fill_padding_tbl) - 32b
- .hword L(ipred_z3_fill_padding_tbl) - 16b
- .hword L(ipred_z3_fill_padding_tbl) - 8b
- .hword L(ipred_z3_fill_padding_tbl) - 4b
- .hword L(ipred_z3_fill_padding_tbl) - 2b
+ .xword 640b // absolute 64-bit addresses; targets renamed to the new 20:/40:/80:.. labels
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .xword 20b
+ .popsection // back to the code section: the _wide path below is still part of the function
L(ipred_z3_fill_padding_wide):
// Fill a WxH rectangle with padding, with W > 8.
@@ -3880,13 +3911,13 @@ function ipred_filter_\bpc\()bpc_neon
add x6, x6, w5, uxtw
ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
clz w9, w3
- adr x5, L(ipred_filter\bpc\()_tbl)
+ adrp x5, L(ipred_filter\bpc\()_tbl) // page address of the per-bpc jump table
+ add x5, x5, :lo12: L(ipred_filter\bpc\()_tbl) // x5 = address of the jump table
ld1 {v20.8b, v21.8b, v22.8b}, [x6]
sub w9, w9, #26
- ldrh w9, [x5, w9, uxtw #1]
+ ldr x5, [x5, w9, uxtw #3] // x5 = absolute branch target, entry w9 (8 bytes each)
sxtl v16.8h, v16.8b
sxtl v17.8h, v17.8b
- sub x5, x5, w9, uxtw
sxtl v18.8h, v18.8b
sxtl v19.8h, v19.8b
add x6, x0, x1
@@ -4160,11 +4191,14 @@ function ipred_filter_\bpc\()bpc_neon
9:
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_filter\bpc\()_tbl):
- .hword L(ipred_filter\bpc\()_tbl) - 320b
- .hword L(ipred_filter\bpc\()_tbl) - 160b
- .hword L(ipred_filter\bpc\()_tbl) - 80b
- .hword L(ipred_filter\bpc\()_tbl) - 40b
+ .xword 320b // each entry is the absolute 64-bit address of a branch target
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
.endm
@@ -4184,11 +4217,11 @@ endfunc
function pal_pred_16bpc_neon, export=1
ld1 {v30.8h}, [x2]
clz w9, w4
- adr x6, L(pal_pred_tbl)
+ adrp x6, L(pal_pred_tbl) // page address of the jump table
+ add x6, x6, :lo12: L(pal_pred_tbl) // x6 = address of the jump table
sub w9, w9, #25
- ldrh w9, [x6, w9, uxtw #1]
+ ldr x6, [x6, w9, uxtw #3] // x6 = absolute branch target, entry w9 (8 bytes each)
movi v31.8h, #1, lsl #8
- sub x6, x6, w9, uxtw
br x6
40:
AARCH64_VALID_JUMP_TARGET
@@ -4357,12 +4390,15 @@ function pal_pred_16bpc_neon, export=1
b.gt 64b
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(pal_pred_tbl):
- .hword L(pal_pred_tbl) - 640b
- .hword L(pal_pred_tbl) - 320b
- .hword L(pal_pred_tbl) - 160b
- .hword L(pal_pred_tbl) - 80b
- .hword L(pal_pred_tbl) - 40b
+ .xword 640b // each entry is the absolute 64-bit address of a branch target
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -4373,12 +4408,12 @@ endfunc
function ipred_cfl_128_16bpc_neon, export=1
dup v31.8h, w7 // bitdepth_max
clz w9, w3
- adr x7, L(ipred_cfl_128_tbl)
+ adrp x7, L(ipred_cfl_128_tbl) // page address of the jump table
+ add x7, x7, :lo12: L(ipred_cfl_128_tbl) // x7 = address of the jump table
sub w9, w9, #26
- ldrh w9, [x7, w9, uxtw #1]
+ ldr x7, [x7, w9, uxtw #3] // x7 = absolute branch target, entry w9 (8 bytes each)
urshr v0.8h, v31.8h, #1
dup v1.8h, w6 // alpha
- sub x7, x7, w9, uxtw
add x6, x0, x1
lsl x1, x1, #1
movi v30.8h, #0
@@ -4510,12 +4545,15 @@ L(ipred_cfl_splat_w16):
b.gt 1b
ret
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
- .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
- .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
- .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
- .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+ .xword L(ipred_cfl_splat_w16) // absolute 64-bit addresses; both labels above name this one table
+ .xword L(ipred_cfl_splat_w16)
+ .xword L(ipred_cfl_splat_w8)
+ .xword L(ipred_cfl_splat_w4)
+ .popsection
endfunc
// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -4526,12 +4563,12 @@ endfunc
function ipred_cfl_top_16bpc_neon, export=1
dup v31.8h, w7 // bitdepth_max
clz w9, w3
- adr x7, L(ipred_cfl_top_tbl)
+ adrp x7, L(ipred_cfl_top_tbl) // page address of the jump table
+ add x7, x7, :lo12: L(ipred_cfl_top_tbl) // x7 = address of the jump table
sub w9, w9, #26
- ldrh w9, [x7, w9, uxtw #1]
+ ldr x7, [x7, w9, uxtw #3] // x7 = absolute branch target, entry w9 (8 bytes each)
dup v1.8h, w6 // alpha
add x2, x2, #2
- sub x7, x7, w9, uxtw
add x6, x0, x1
lsl x1, x1, #1
movi v30.8h, #0
@@ -4569,11 +4606,14 @@ function ipred_cfl_top_16bpc_neon, export=1
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w16)
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_cfl_top_tbl):
- .hword L(ipred_cfl_top_tbl) - 32b
- .hword L(ipred_cfl_top_tbl) - 16b
- .hword L(ipred_cfl_top_tbl) - 8b
- .hword L(ipred_cfl_top_tbl) - 4b
+ .xword 32b // each entry is the absolute 64-bit address of a branch target
+ .xword 16b
+ .xword 8b
+ .xword 4b
+ .popsection
endfunc
// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -4586,15 +4625,15 @@ function ipred_cfl_left_16bpc_neon, export=1
sub x2, x2, w4, uxtw #1
clz w9, w3
clz w8, w4
- adr x10, L(ipred_cfl_splat_tbl)
- adr x7, L(ipred_cfl_left_tbl)
+ adrp x10, L(ipred_cfl_splat_tbl) // page address of the splat table
+ add x10, x10, :lo12: L(ipred_cfl_splat_tbl) // x10 = address of the splat table
+ adrp x7, L(ipred_cfl_left_tbl) // page address of this function's own table
+ add x7, x7, :lo12: L(ipred_cfl_left_tbl) // x7 = address of the left table
sub w9, w9, #26
sub w8, w8, #26
- ldrh w9, [x10, w9, uxtw #1]
- ldrh w8, [x7, w8, uxtw #1]
+ ldr x9, [x10, w9, uxtw #3] // x9 = absolute splat target, entry w9
+ ldr x7, [x7, w8, uxtw #3] // x7 = absolute _h* target, entry w8
dup v1.8h, w6 // alpha
- sub x9, x10, w9, uxtw
- sub x7, x7, w8, uxtw
add x6, x0, x1
lsl x1, x1, #1
movi v30.8h, #0
@@ -4636,11 +4675,14 @@ L(ipred_cfl_left_h32):
dup v0.8h, v0.h[0]
br x9
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_cfl_left_tbl):
- .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
- .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
- .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
- .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+ .xword L(ipred_cfl_left_h32) // each entry is the absolute 64-bit address of a branch target
+ .xword L(ipred_cfl_left_h16)
+ .xword L(ipred_cfl_left_h8)
+ .xword L(ipred_cfl_left_h4)
+ .popsection
endfunc
// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -4656,16 +4697,15 @@ function ipred_cfl_16bpc_neon, export=1
clz w9, w3
clz w6, w4
dup v16.4s, w8 // width + height
- adr x7, L(ipred_cfl_tbl)
+ adrp x7, L(ipred_cfl_tbl) // page address of the jump table
+ add x7, x7, :lo12: L(ipred_cfl_tbl) // x7 = address of the jump table
rbit w8, w8 // rbit(width + height)
sub w9, w9, #22 // 26 leading bits, minus table offset 4
sub w6, w6, #26
clz w8, w8 // ctz(width + height)
- ldrh w9, [x7, w9, uxtw #1]
- ldrh w6, [x7, w6, uxtw #1]
+ ldr x9, [x7, w9, uxtw #3] // x9 = target from the _w* entries (index offset by 4)
+ ldr x7, [x7, w6, uxtw #3] // x7 = target from the _h* entries; loaded last as it overwrites the table base
neg w8, w8 // -ctz(width + height)
- sub x9, x7, w9, uxtw
- sub x7, x7, w6, uxtw
ushr v16.4s, v16.4s, #1 // (width + height) >> 1
dup v17.4s, w8 // -ctz(width + height)
add x6, x0, x1
@@ -4789,15 +4829,18 @@ L(ipred_cfl_w32):
dup v0.8h, v0.h[0]
b L(ipred_cfl_splat_w16)
+ .pushsection .data.rel.ro, "aw" // table now lives in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries
L(ipred_cfl_tbl):
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
- .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+ .xword L(ipred_cfl_h32) // entries 0-3: _h* targets, absolute 64-bit addresses
+ .xword L(ipred_cfl_h16)
+ .xword L(ipred_cfl_h8)
+ .xword L(ipred_cfl_h4)
+ .xword L(ipred_cfl_w32) // entries 4-7: _w* targets
+ .xword L(ipred_cfl_w16)
+ .xword L(ipred_cfl_w8)
+ .xword L(ipred_cfl_w4)
+ .popsection
endfunc
// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
@@ -4806,14 +4848,14 @@ endfunc
function ipred_cfl_ac_420_16bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
- adr x7, L(ipred_cfl_ac_420_tbl)
+ adrp x7, L(ipred_cfl_ac_420_tbl) // page address of the jump table
+ add x7, x7, :lo12: L(ipred_cfl_ac_420_tbl) // x7 = address of the jump table
sub w8, w8, #27
- ldrh w8, [x7, w8, uxtw #1]
+ ldr x7, [x7, w8, uxtw #3] // x7 = absolute branch target, entry w8 (8 bytes each)
movi v24.4s, #0
movi v25.4s, #0
movi v26.4s, #0
movi v27.4s, #0
- sub x7, x7, w8, uxtw
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
@@ -4945,9 +4987,9 @@ L(ipred_cfl_ac_420_w8_hpad):
L(ipred_cfl_ac_420_w16):
AARCH64_VALID_JUMP_TARGET
- adr x7, L(ipred_cfl_ac_420_w16_tbl)
- ldrh w3, [x7, w3, uxtw #1]
- sub x7, x7, w3, uxtw
+ adrp x7, L(ipred_cfl_ac_420_w16_tbl) // page address of the second-level table
+ add x7, x7, :lo12: L(ipred_cfl_ac_420_w16_tbl) // x7 = address of the _wpad* table
+ ldr x7, [x7, w3, uxtw #3] // x7 = absolute _wpad* target, entry w3; w3 itself is no longer overwritten
br x7
L(ipred_cfl_ac_420_w16_wpad0):
@@ -5124,17 +5166,20 @@ L(ipred_cfl_ac_420_w16_hpad):
lsl w6, w6, #2
b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+ .pushsection .data.rel.ro, "aw" // both tables now live in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries of both tables
L(ipred_cfl_ac_420_tbl):
- .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
- .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
- .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
- .hword 0
+ .xword L(ipred_cfl_ac_420_w16) // each entry is the absolute 64-bit address of a branch target
+ .xword L(ipred_cfl_ac_420_w8)
+ .xword L(ipred_cfl_ac_420_w4)
+ .xword 0 // placeholder carried over from the old .hword 0 entry
L(ipred_cfl_ac_420_w16_tbl):
- .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
- .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
- .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
- .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+ .xword L(ipred_cfl_ac_420_w16_wpad0) // second-level table used by the w16 path
+ .xword L(ipred_cfl_ac_420_w16_wpad1)
+ .xword L(ipred_cfl_ac_420_w16_wpad2)
+ .xword L(ipred_cfl_ac_420_w16_wpad3)
+ .popsection
endfunc
// void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
@@ -5143,14 +5187,14 @@ endfunc
function ipred_cfl_ac_422_16bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
- adr x7, L(ipred_cfl_ac_422_tbl)
+ adrp x7, L(ipred_cfl_ac_422_tbl) // page address of the jump table
+ add x7, x7, :lo12: L(ipred_cfl_ac_422_tbl) // x7 = address of the jump table
sub w8, w8, #27
- ldrh w8, [x7, w8, uxtw #1]
+ ldr x7, [x7, w8, uxtw #3] // x7 = absolute branch target, entry w8 (8 bytes each)
movi v24.4s, #0
movi v25.4s, #0
movi v26.4s, #0
movi v27.4s, #0
- sub x7, x7, w8, uxtw
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
@@ -5251,9 +5295,9 @@ L(ipred_cfl_ac_422_w8_wpad):
L(ipred_cfl_ac_422_w16):
AARCH64_VALID_JUMP_TARGET
- adr x7, L(ipred_cfl_ac_422_w16_tbl)
- ldrh w3, [x7, w3, uxtw #1]
- sub x7, x7, w3, uxtw
+ adrp x7, L(ipred_cfl_ac_422_w16_tbl) // page address of the second-level table
+ add x7, x7, :lo12: L(ipred_cfl_ac_422_w16_tbl) // x7 = address of the _wpad* table
+ ldr x7, [x7, w3, uxtw #3] // x7 = absolute _wpad* target, entry w3; w3 itself is no longer overwritten
br x7
L(ipred_cfl_ac_422_w16_wpad0):
@@ -5372,17 +5416,20 @@ L(ipred_cfl_ac_422_w16_wpad3):
mov v1.16b, v3.16b
b L(ipred_cfl_ac_420_w16_hpad)
+ .pushsection .data.rel.ro, "aw" // both tables now live in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries of both tables
L(ipred_cfl_ac_422_tbl):
- .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
- .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
- .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
- .hword 0
+ .xword L(ipred_cfl_ac_422_w16) // each entry is the absolute 64-bit address of a branch target
+ .xword L(ipred_cfl_ac_422_w8)
+ .xword L(ipred_cfl_ac_422_w4)
+ .xword 0 // placeholder carried over from the old .hword 0 entry
L(ipred_cfl_ac_422_w16_tbl):
- .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
- .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
- .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
- .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+ .xword L(ipred_cfl_ac_422_w16_wpad0) // second-level table used by the w16 path
+ .xword L(ipred_cfl_ac_422_w16_wpad1)
+ .xword L(ipred_cfl_ac_422_w16_wpad2)
+ .xword L(ipred_cfl_ac_422_w16_wpad3)
+ .popsection
endfunc
// void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
@@ -5391,14 +5437,14 @@ endfunc
function ipred_cfl_ac_444_16bpc_neon, export=1
clz w8, w5
lsl w4, w4, #2
- adr x7, L(ipred_cfl_ac_444_tbl)
+ adrp x7, L(ipred_cfl_ac_444_tbl) // page address of the jump table
+ add x7, x7, :lo12: L(ipred_cfl_ac_444_tbl) // x7 = address of the jump table
sub w8, w8, #26
- ldrh w8, [x7, w8, uxtw #1]
+ ldr x7, [x7, w8, uxtw #3] // x7 = absolute branch target, entry w8 (8 bytes each)
movi v24.4s, #0
movi v25.4s, #0
movi v26.4s, #0
movi v27.4s, #0
- sub x7, x7, w8, uxtw
sub w8, w6, w4 // height - h_pad
rbit w9, w5 // rbit(width)
rbit w10, w6 // rbit(height)
@@ -5507,10 +5553,11 @@ L(ipred_cfl_ac_444_w16_wpad):
L(ipred_cfl_ac_444_w32):
AARCH64_VALID_JUMP_TARGET
- adr x7, L(ipred_cfl_ac_444_w32_tbl)
- ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1
+ adrp x7, L(ipred_cfl_ac_444_w32_tbl) // page address of the second-level table
+ add x7, x7, :lo12: L(ipred_cfl_ac_444_w32_tbl) // x7 = address of the _wpad* table
+ lsr w3, w3, #1 // old code used w3 directly as a byte offset into 2-byte entries; turn it into an entry index
+ ldr x7, [x7, w3, uxtw #3] // (w3>>1) << 3
lsr x2, x2, #1 // Restore the stride to one line increments
- sub x7, x7, w3, uxtw
br x7
L(ipred_cfl_ac_444_w32_wpad0):
@@ -5625,15 +5672,18 @@ L(ipred_cfl_ac_444_w32_hpad):
lsl w6, w6, #3
b L(ipred_cfl_ac_420_w4_calc_subtract_dc)
+ .pushsection .data.rel.ro, "aw" // both tables now live in a data section instead of inline in the code
+ .p2align 3 // 8-byte align the .xword entries of both tables
L(ipred_cfl_ac_444_tbl):
- .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
- .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
- .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
- .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+ .xword L(ipred_cfl_ac_444_w32) // each entry is the absolute 64-bit address of a branch target
+ .xword L(ipred_cfl_ac_444_w16)
+ .xword L(ipred_cfl_ac_444_w8)
+ .xword L(ipred_cfl_ac_444_w4)
L(ipred_cfl_ac_444_w32_tbl):
- .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
- .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
- .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
- .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
+ .xword L(ipred_cfl_ac_444_w32_wpad0) // second-level table used by the w32 path
+ .xword L(ipred_cfl_ac_444_w32_wpad2)
+ .xword L(ipred_cfl_ac_444_w32_wpad4)
+ .xword L(ipred_cfl_ac_444_w32_wpad6)
+ .popsection
endfunc