ports/multimedia/dav1d/patches/patch-src_arm_64_mc16_S

523 lines
17 KiB
Text

Convert the width-dispatch jump tables in mc16.S from 16-bit relative
offsets stored next to the code into tables of absolute 64-bit addresses
stored in .data.rel.ro.

For every table touched by this patch:
  - the table base is formed with adrp + add :lo12: instead of adr;
  - the entry is fetched with "ldr xN, [xN, xM, lsl #3]" (8-byte entries)
    instead of "ldrh" (2-byte entries) followed by "sub" from the table
    base, so the loaded value is the branch target itself;
  - the table body is wrapped in .pushsection .data.rel.ro, "aw" /
    .popsection and each ".hword table - label" becomes ".xword label";
  - the trailing ".hword 0" entries are kept as ".xword 0".

NOTE(review): presumably the goal is that the tables are no longer read
from the text section (e.g. to allow execute-only code mappings) --
confirm against the port's commit log.
NOTE(review): no alignment directive is emitted between .pushsection and
the table labels; consider whether an explicit ".p2align 3" is wanted for
the 8-byte entries -- confirm.

Index: src/arm/64/mc16.S
--- src/arm/64/mc16.S.orig
+++ src/arm/64/mc16.S
@@ -145,11 +145,11 @@ function \type\()_16bpc_neon, export=1
dup v27.4s, w6
neg v27.4s, v27.4s
.endif
- adr x7, L(\type\()_tbl)
+ adrp x7, L(\type\()_tbl)
+ add x7, x7, :lo12: L(\type\()_tbl)
sub w4, w4, #24
\type v4, v5, v0, v1, v2, v3
- ldrh w4, [x7, x4, lsl #1]
- sub x7, x7, w4, uxtw
+ ldr x7, [x7, x4, lsl #3]
br x7
40:
AARCH64_VALID_JUMP_TARGET
@@ -228,13 +228,15 @@ function \type\()_16bpc_neon, export=1
b 128b
0:
ret
+ .pushsection .data.rel.ro, "aw"
L(\type\()_tbl):
- .hword L(\type\()_tbl) - 1280b
- .hword L(\type\()_tbl) - 640b
- .hword L(\type\()_tbl) - 32b
- .hword L(\type\()_tbl) - 16b
- .hword L(\type\()_tbl) - 80b
- .hword L(\type\()_tbl) - 40b
+ .xword 1280b
+ .xword 640b
+ .xword 32b
+ .xword 16b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
.endm
@@ -247,12 +249,12 @@ bidir_fn mask, w7
function w_mask_\type\()_16bpc_neon, export=1
ldr w8, [sp]
clz w9, w4
- adr x10, L(w_mask_\type\()_tbl)
+ adrp x10, L(w_mask_\type\()_tbl)
+ add x10, x10, :lo12: L(w_mask_\type\()_tbl)
dup v31.8h, w8 // bitdepth_max
sub w9, w9, #24
clz w8, w8 // clz(bitdepth_max)
- ldrh w9, [x10, x9, lsl #1]
- sub x10, x10, w9, uxtw
+ ldr x10, [x10, x9, lsl #3]
sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
mov w9, #PREP_BIAS*64
neg w8, w8 // -sh
@@ -541,13 +543,15 @@ function w_mask_\type\()_16bpc_neon, export=1
add x12, x12, x1
b.gt 161b
ret
+ .pushsection .data.rel.ro, "aw"
L(w_mask_\type\()_tbl):
- .hword L(w_mask_\type\()_tbl) - 1280b
- .hword L(w_mask_\type\()_tbl) - 640b
- .hword L(w_mask_\type\()_tbl) - 320b
- .hword L(w_mask_\type\()_tbl) - 160b
- .hword L(w_mask_\type\()_tbl) - 8b
- .hword L(w_mask_\type\()_tbl) - 4b
+ .xword 1280b
+ .xword 640b
+ .xword 320b
+ .xword 160b
+ .xword 8b
+ .xword 4b
+ .popsection
endfunc
.endm
@@ -557,11 +561,11 @@ w_mask_fn 420
function blend_16bpc_neon, export=1
- adr x6, L(blend_tbl)
+ adrp x6, L(blend_tbl)
+ add x6, x6, :lo12: L(blend_tbl)
clz w3, w3
sub w3, w3, #26
- ldrh w3, [x6, x3, lsl #1]
- sub x6, x6, w3, uxtw
+ ldr x6, [x6, x3, lsl #3]
add x8, x0, x1
br x6
40:
@@ -673,15 +677,18 @@ function blend_16bpc_neon, export=1
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
b.gt 32b
ret
+ .pushsection .data.rel.ro, "aw"
L(blend_tbl):
- .hword L(blend_tbl) - 32b
- .hword L(blend_tbl) - 160b
- .hword L(blend_tbl) - 80b
- .hword L(blend_tbl) - 40b
+ .xword 32b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
function blend_h_16bpc_neon, export=1
- adr x6, L(blend_h_tbl)
+ adrp x6, L(blend_h_tbl)
+ add x6, x6, :lo12: L(blend_h_tbl)
movrel x5, X(obmc_masks)
add x5, x5, w4, uxtw
sub w4, w4, w4, lsr #2
@@ -689,8 +696,7 @@ function blend_h_16bpc_neon, export=1
add x8, x0, x1
lsl x1, x1, #1
sub w7, w7, #24
- ldrh w7, [x6, x7, lsl #1]
- sub x6, x6, w7, uxtw
+ ldr x6, [x6, x7, lsl #3]
br x6
2:
AARCH64_VALID_JUMP_TARGET
@@ -835,26 +841,28 @@ function blend_h_16bpc_neon, export=1
add x7, x7, w3, uxtw #1
b.gt 321b
ret
+ .pushsection .data.rel.ro, "aw"
L(blend_h_tbl):
- .hword L(blend_h_tbl) - 1280b
- .hword L(blend_h_tbl) - 640b
- .hword L(blend_h_tbl) - 320b
- .hword L(blend_h_tbl) - 16b
- .hword L(blend_h_tbl) - 8b
- .hword L(blend_h_tbl) - 4b
- .hword L(blend_h_tbl) - 2b
+ .xword 1280b
+ .xword 640b
+ .xword 320b
+ .xword 16b
+ .xword 8b
+ .xword 4b
+ .xword 2b
+ .popsection
endfunc
function blend_v_16bpc_neon, export=1
- adr x6, L(blend_v_tbl)
+ adrp x6, L(blend_v_tbl)
+ add x6, x6, :lo12: L(blend_v_tbl)
movrel x5, X(obmc_masks)
add x5, x5, w3, uxtw
clz w3, w3
add x8, x0, x1
lsl x1, x1, #1
sub w3, w3, #26
- ldrh w3, [x6, x3, lsl #1]
- sub x6, x6, w3, uxtw
+ ldr x6, [x6, x3, lsl #3]
br x6
20:
AARCH64_VALID_JUMP_TARGET
@@ -992,21 +1000,23 @@ function blend_v_16bpc_neon, export=1
st1 {v4.8h, v5.8h, v6.8h}, [x8], x1
b.gt 32b
ret
+ .pushsection .data.rel.ro, "aw"
L(blend_v_tbl):
- .hword L(blend_v_tbl) - 320b
- .hword L(blend_v_tbl) - 160b
- .hword L(blend_v_tbl) - 80b
- .hword L(blend_v_tbl) - 40b
- .hword L(blend_v_tbl) - 20b
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .xword 20b
+ .popsection
endfunc
// This has got the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
function put_neon
- adr x10, L(put_tbl)
- ldrh w9, [x10, x9, lsl #1]
- sub x10, x10, w9, uxtw
+ adrp x10, L(put_tbl)
+ add x10, x10, :lo12: L(put_tbl)
+ ldr x10, [x10, x9, lsl #3]
br x10
2:
@@ -1106,14 +1116,16 @@ function put_neon
b.gt 128b
ret
+ .pushsection .data.rel.ro, "aw"
L(put_tbl):
- .hword L(put_tbl) - 128b
- .hword L(put_tbl) - 64b
- .hword L(put_tbl) - 32b
- .hword L(put_tbl) - 16b
- .hword L(put_tbl) - 80b
- .hword L(put_tbl) - 4b
- .hword L(put_tbl) - 2b
+ .xword 128b
+ .xword 64b
+ .xword 32b
+ .xword 16b
+ .xword 80b
+ .xword 4b
+ .xword 2b
+ .popsection
endfunc
@@ -1121,11 +1133,11 @@ endfunc
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
function prep_neon
- adr x10, L(prep_tbl)
- ldrh w9, [x10, x9, lsl #1]
+ adrp x10, L(prep_tbl)
+ add x10, x10, :lo12: L(prep_tbl)
+ ldr x10, [x10, x9, lsl #3]
dup v31.8h, w7 // intermediate_bits
movi v30.8h, #(PREP_BIAS >> 8), lsl #8
- sub x10, x10, w9, uxtw
br x10
40:
@@ -1278,13 +1290,15 @@ function prep_neon
b.gt 128b
ret
+ .pushsection .data.rel.ro, "aw"
L(prep_tbl):
- .hword L(prep_tbl) - 128b
- .hword L(prep_tbl) - 64b
- .hword L(prep_tbl) - 32b
- .hword L(prep_tbl) - 16b
- .hword L(prep_tbl) - 80b
- .hword L(prep_tbl) - 40b
+ .xword 128b
+ .xword 64b
+ .xword 32b
+ .xword 16b
+ .xword 80b
+ .xword 40b
+ .popsection
endfunc
@@ -1563,16 +1577,16 @@ L(\type\()_8tap_h):
add \xmx, x11, \mx, uxtw #3
b.ne L(\type\()_8tap_hv)
- adr x10, L(\type\()_8tap_h_tbl)
+ adrp x10, L(\type\()_8tap_h_tbl)
+ add x10, x10, :lo12: L(\type\()_8tap_h_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
- ldrh w9, [x10, x9, lsl #1]
+ ldr x10, [x10, x9, lsl #3]
neg v30.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
dup v29.8h, \bdmax // intermediate_bits
.else
movi v28.8h, #(PREP_BIAS >> 8), lsl #8
.endif
- sub x10, x10, w9, uxtw
.ifc \type, put
neg v29.8h, v29.8h // -intermediate_bits
.endif
@@ -1734,15 +1748,17 @@ L(\type\()_8tap_h):
b.gt 81b
ret
+ .pushsection .data.rel.ro, "aw"
L(\type\()_8tap_h_tbl):
- .hword L(\type\()_8tap_h_tbl) - 1280b
- .hword L(\type\()_8tap_h_tbl) - 640b
- .hword L(\type\()_8tap_h_tbl) - 320b
- .hword L(\type\()_8tap_h_tbl) - 160b
- .hword L(\type\()_8tap_h_tbl) - 80b
- .hword L(\type\()_8tap_h_tbl) - 40b
- .hword L(\type\()_8tap_h_tbl) - 20b
- .hword 0
+ .xword 1280b
+ .xword 640b
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .xword 20b
+ .xword 0
+ .popsection
L(\type\()_8tap_v):
@@ -1758,12 +1774,12 @@ L(\type\()_8tap_v):
dup v30.4s, w12 // 6 - intermediate_bits
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
- adr x10, L(\type\()_8tap_v_tbl)
- ldrh w9, [x10, x9, lsl #1]
+ adrp x10, L(\type\()_8tap_v_tbl)
+ add x10, x10, :lo12: L(\type\()_8tap_v_tbl)
+ ldr x10, [x10, x9, lsl #3]
.ifc \type, prep
neg v30.4s, v30.4s // -(6-intermediate_bits)
.endif
- sub x10, x10, w9, uxtw
br x10
20: // 2xN v
@@ -2029,15 +2045,17 @@ L(\type\()_8tap_v):
0:
ret
+ .pushsection .data.rel.ro, "aw"
L(\type\()_8tap_v_tbl):
- .hword L(\type\()_8tap_v_tbl) - 1280b
- .hword L(\type\()_8tap_v_tbl) - 640b
- .hword L(\type\()_8tap_v_tbl) - 320b
- .hword L(\type\()_8tap_v_tbl) - 160b
- .hword L(\type\()_8tap_v_tbl) - 80b
- .hword L(\type\()_8tap_v_tbl) - 40b
- .hword L(\type\()_8tap_v_tbl) - 20b
- .hword 0
+ .xword 1280b
+ .xword 640b
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .xword 20b
+ .xword 0
+ .popsection
L(\type\()_8tap_hv):
cmp \h, #4
@@ -2048,16 +2066,16 @@ L(\type\()_8tap_hv):
4:
add \xmy, x11, \my, uxtw #3
- adr x10, L(\type\()_8tap_hv_tbl)
+ adrp x10, L(\type\()_8tap_hv_tbl)
+ add x10, x10, :lo12: L(\type\()_8tap_hv_tbl)
dup v30.4s, w12 // 6 - intermediate_bits
- ldrh w9, [x10, x9, lsl #1]
+ ldr x10, [x10, x9, lsl #3]
neg v30.4s, v30.4s // -(6-intermediate_bits)
.ifc \type, put
dup v29.4s, w13 // 6 + intermediate_bits
.else
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
- sub x10, x10, w9, uxtw
.ifc \type, put
neg v29.4s, v29.4s // -(6+intermediate_bits)
.endif
@@ -2623,15 +2641,17 @@ L(\type\()_8tap_filter_8):
uzp1 v24.8h, v27.8h, v28.8h // Ditto
ret
+ .pushsection .data.rel.ro, "aw"
L(\type\()_8tap_hv_tbl):
- .hword L(\type\()_8tap_hv_tbl) - 1280b
- .hword L(\type\()_8tap_hv_tbl) - 640b
- .hword L(\type\()_8tap_hv_tbl) - 320b
- .hword L(\type\()_8tap_hv_tbl) - 160b
- .hword L(\type\()_8tap_hv_tbl) - 80b
- .hword L(\type\()_8tap_hv_tbl) - 40b
- .hword L(\type\()_8tap_hv_tbl) - 20b
- .hword 0
+ .xword 1280b
+ .xword 640b
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .xword 20b
+ .xword 0
+ .popsection
endfunc
@@ -2665,16 +2685,16 @@ function \type\()_bilin_16bpc_neon, export=1
L(\type\()_bilin_h):
cbnz \my, L(\type\()_bilin_hv)
- adr x10, L(\type\()_bilin_h_tbl)
+ adrp x10, L(\type\()_bilin_h_tbl)
+ add x10, x10, :lo12: L(\type\()_bilin_h_tbl)
dup v31.8h, w11 // 4 - intermediate_bits
- ldrh w9, [x10, x9, lsl #1]
+ ldr x10, [x10, x9, lsl #3]
neg v31.8h, v31.8h // -(4-intermediate_bits)
.ifc \type, put
dup v30.8h, \bdmax // intermediate_bits
.else
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
- sub x10, x10, w9, uxtw
.ifc \type, put
neg v30.8h, v30.8h // -intermediate_bits
.endif
@@ -2832,29 +2852,31 @@ L(\type\()_bilin_h):
b.gt 161b
ret
+ .pushsection .data.rel.ro, "aw"
L(\type\()_bilin_h_tbl):
- .hword L(\type\()_bilin_h_tbl) - 1280b
- .hword L(\type\()_bilin_h_tbl) - 640b
- .hword L(\type\()_bilin_h_tbl) - 320b
- .hword L(\type\()_bilin_h_tbl) - 160b
- .hword L(\type\()_bilin_h_tbl) - 80b
- .hword L(\type\()_bilin_h_tbl) - 40b
- .hword L(\type\()_bilin_h_tbl) - 20b
- .hword 0
+ .xword 1280b
+ .xword 640b
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .xword 20b
+ .xword 0
+ .popsection
L(\type\()_bilin_v):
cmp \h, #4
- adr x10, L(\type\()_bilin_v_tbl)
+ adrp x10, L(\type\()_bilin_v_tbl)
+ add x10, x10, :lo12: L(\type\()_bilin_v_tbl)
.ifc \type, prep
dup v31.8h, w11 // 4 - intermediate_bits
.endif
- ldrh w9, [x10, x9, lsl #1]
+ ldr x10, [x10, x9, lsl #3]
.ifc \type, prep
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
neg v31.8h, v31.8h // -(4-intermediate_bits)
.endif
- sub x10, x10, w9, uxtw
br x10
20: // 2xN v
@@ -3030,27 +3052,29 @@ L(\type\()_bilin_v):
0:
ret
+ .pushsection .data.rel.ro, "aw"
L(\type\()_bilin_v_tbl):
- .hword L(\type\()_bilin_v_tbl) - 1280b
- .hword L(\type\()_bilin_v_tbl) - 640b
- .hword L(\type\()_bilin_v_tbl) - 320b
- .hword L(\type\()_bilin_v_tbl) - 160b
- .hword L(\type\()_bilin_v_tbl) - 80b
- .hword L(\type\()_bilin_v_tbl) - 40b
- .hword L(\type\()_bilin_v_tbl) - 20b
- .hword 0
+ .xword 1280b
+ .xword 640b
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .xword 20b
+ .xword 0
+ .popsection
L(\type\()_bilin_hv):
- adr x10, L(\type\()_bilin_hv_tbl)
+ adrp x10, L(\type\()_bilin_hv_tbl)
+ add x10, x10, :lo12: L(\type\()_bilin_hv_tbl)
dup v31.8h, w11 // 4 - intermediate_bits
- ldrh w9, [x10, x9, lsl #1]
+ ldr x10, [x10, x9, lsl #3]
neg v31.8h, v31.8h // -(4-intermediate_bits)
.ifc \type, put
dup v30.4s, w12 // 4 + intermediate_bits
.else
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
.endif
- sub x10, x10, w9, uxtw
.ifc \type, put
neg v30.4s, v30.4s // -(4+intermediate_bits)
.endif
@@ -3224,15 +3248,17 @@ L(\type\()_bilin_hv):
0:
ret
+ .pushsection .data.rel.ro, "aw"
L(\type\()_bilin_hv_tbl):
- .hword L(\type\()_bilin_hv_tbl) - 1280b
- .hword L(\type\()_bilin_hv_tbl) - 640b
- .hword L(\type\()_bilin_hv_tbl) - 320b
- .hword L(\type\()_bilin_hv_tbl) - 160b
- .hword L(\type\()_bilin_hv_tbl) - 80b
- .hword L(\type\()_bilin_hv_tbl) - 40b
- .hword L(\type\()_bilin_hv_tbl) - 20b
- .hword 0
+ .xword 1280b
+ .xword 640b
+ .xword 320b
+ .xword 160b
+ .xword 80b
+ .xword 40b
+ .xword 20b
+ .xword 0
+ .popsection
endfunc
.endm