523 lines
17 KiB
Text
Index: src/arm/64/mc16.S
|
|
--- src/arm/64/mc16.S.orig
|
|
+++ src/arm/64/mc16.S
|
|
@@ -145,11 +145,11 @@ function \type\()_16bpc_neon, export=1
|
|
dup v27.4s, w6
|
|
neg v27.4s, v27.4s
|
|
.endif
|
|
- adr x7, L(\type\()_tbl)
|
|
+ adrp x7, L(\type\()_tbl)
|
|
+ add x7, x7, :lo12: L(\type\()_tbl)
|
|
sub w4, w4, #24
|
|
\type v4, v5, v0, v1, v2, v3
|
|
- ldrh w4, [x7, x4, lsl #1]
|
|
- sub x7, x7, w4, uxtw
|
|
+ ldr x7, [x7, x4, lsl #3]
|
|
br x7
|
|
40:
|
|
AARCH64_VALID_JUMP_TARGET
|
|
@@ -228,13 +228,15 @@ function \type\()_16bpc_neon, export=1
|
|
b 128b
|
|
0:
|
|
ret
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(\type\()_tbl):
|
|
- .hword L(\type\()_tbl) - 1280b
|
|
- .hword L(\type\()_tbl) - 640b
|
|
- .hword L(\type\()_tbl) - 32b
|
|
- .hword L(\type\()_tbl) - 16b
|
|
- .hword L(\type\()_tbl) - 80b
|
|
- .hword L(\type\()_tbl) - 40b
|
|
+ .xword 1280b
|
|
+ .xword 640b
|
|
+ .xword 32b
|
|
+ .xword 16b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .popsection
|
|
endfunc
|
|
.endm
|
|
|
|
@@ -247,12 +249,12 @@ bidir_fn mask, w7
|
|
function w_mask_\type\()_16bpc_neon, export=1
|
|
ldr w8, [sp]
|
|
clz w9, w4
|
|
- adr x10, L(w_mask_\type\()_tbl)
|
|
+ adrp x10, L(w_mask_\type\()_tbl)
|
|
+ add x10, x10, :lo12: L(w_mask_\type\()_tbl)
|
|
dup v31.8h, w8 // bitdepth_max
|
|
sub w9, w9, #24
|
|
clz w8, w8 // clz(bitdepth_max)
|
|
- ldrh w9, [x10, x9, lsl #1]
|
|
- sub x10, x10, w9, uxtw
|
|
+ ldr x10, [x10, x9, lsl #3]
|
|
sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
|
|
mov w9, #PREP_BIAS*64
|
|
neg w8, w8 // -sh
|
|
@@ -541,13 +543,15 @@ function w_mask_\type\()_16bpc_neon, export=1
|
|
add x12, x12, x1
|
|
b.gt 161b
|
|
ret
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(w_mask_\type\()_tbl):
|
|
- .hword L(w_mask_\type\()_tbl) - 1280b
|
|
- .hword L(w_mask_\type\()_tbl) - 640b
|
|
- .hword L(w_mask_\type\()_tbl) - 320b
|
|
- .hword L(w_mask_\type\()_tbl) - 160b
|
|
- .hword L(w_mask_\type\()_tbl) - 8b
|
|
- .hword L(w_mask_\type\()_tbl) - 4b
|
|
+ .xword 1280b
|
|
+ .xword 640b
|
|
+ .xword 320b
|
|
+ .xword 160b
|
|
+ .xword 8b
|
|
+ .xword 4b
|
|
+ .popsection
|
|
endfunc
|
|
.endm
|
|
|
|
@@ -557,11 +561,11 @@ w_mask_fn 420
|
|
|
|
|
|
function blend_16bpc_neon, export=1
|
|
- adr x6, L(blend_tbl)
|
|
+ adrp x6, L(blend_tbl)
|
|
+ add x6, x6, :lo12: L(blend_tbl)
|
|
clz w3, w3
|
|
sub w3, w3, #26
|
|
- ldrh w3, [x6, x3, lsl #1]
|
|
- sub x6, x6, w3, uxtw
|
|
+ ldr x6, [x6, x3, lsl #3]
|
|
add x8, x0, x1
|
|
br x6
|
|
40:
|
|
@@ -673,15 +677,18 @@ function blend_16bpc_neon, export=1
|
|
st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
|
|
b.gt 32b
|
|
ret
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(blend_tbl):
|
|
- .hword L(blend_tbl) - 32b
|
|
- .hword L(blend_tbl) - 160b
|
|
- .hword L(blend_tbl) - 80b
|
|
- .hword L(blend_tbl) - 40b
|
|
+ .xword 32b
|
|
+ .xword 160b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .popsection
|
|
endfunc
|
|
|
|
function blend_h_16bpc_neon, export=1
|
|
- adr x6, L(blend_h_tbl)
|
|
+ adrp x6, L(blend_h_tbl)
|
|
+ add x6, x6, :lo12: L(blend_h_tbl)
|
|
movrel x5, X(obmc_masks)
|
|
add x5, x5, w4, uxtw
|
|
sub w4, w4, w4, lsr #2
|
|
@@ -689,8 +696,7 @@ function blend_h_16bpc_neon, export=1
|
|
add x8, x0, x1
|
|
lsl x1, x1, #1
|
|
sub w7, w7, #24
|
|
- ldrh w7, [x6, x7, lsl #1]
|
|
- sub x6, x6, w7, uxtw
|
|
+ ldr x6, [x6, x7, lsl #3]
|
|
br x6
|
|
2:
|
|
AARCH64_VALID_JUMP_TARGET
|
|
@@ -835,26 +841,28 @@ function blend_h_16bpc_neon, export=1
|
|
add x7, x7, w3, uxtw #1
|
|
b.gt 321b
|
|
ret
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(blend_h_tbl):
|
|
- .hword L(blend_h_tbl) - 1280b
|
|
- .hword L(blend_h_tbl) - 640b
|
|
- .hword L(blend_h_tbl) - 320b
|
|
- .hword L(blend_h_tbl) - 16b
|
|
- .hword L(blend_h_tbl) - 8b
|
|
- .hword L(blend_h_tbl) - 4b
|
|
- .hword L(blend_h_tbl) - 2b
|
|
+ .xword 1280b
|
|
+ .xword 640b
|
|
+ .xword 320b
|
|
+ .xword 16b
|
|
+ .xword 8b
|
|
+ .xword 4b
|
|
+ .xword 2b
|
|
+ .popsection
|
|
endfunc
|
|
|
|
function blend_v_16bpc_neon, export=1
|
|
- adr x6, L(blend_v_tbl)
|
|
+ adrp x6, L(blend_v_tbl)
|
|
+ add x6, x6, :lo12: L(blend_v_tbl)
|
|
movrel x5, X(obmc_masks)
|
|
add x5, x5, w3, uxtw
|
|
clz w3, w3
|
|
add x8, x0, x1
|
|
lsl x1, x1, #1
|
|
sub w3, w3, #26
|
|
- ldrh w3, [x6, x3, lsl #1]
|
|
- sub x6, x6, w3, uxtw
|
|
+ ldr x6, [x6, x3, lsl #3]
|
|
br x6
|
|
20:
|
|
AARCH64_VALID_JUMP_TARGET
|
|
@@ -992,21 +1000,23 @@ function blend_v_16bpc_neon, export=1
|
|
st1 {v4.8h, v5.8h, v6.8h}, [x8], x1
|
|
b.gt 32b
|
|
ret
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(blend_v_tbl):
|
|
- .hword L(blend_v_tbl) - 320b
|
|
- .hword L(blend_v_tbl) - 160b
|
|
- .hword L(blend_v_tbl) - 80b
|
|
- .hword L(blend_v_tbl) - 40b
|
|
- .hword L(blend_v_tbl) - 20b
|
|
+ .xword 320b
|
|
+ .xword 160b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .xword 20b
|
|
+ .popsection
|
|
endfunc
|
|
|
|
|
|
// This has got the same signature as the put_8tap functions,
|
|
// and assumes that x9 is set to (clz(w)-24).
|
|
function put_neon
|
|
- adr x10, L(put_tbl)
|
|
- ldrh w9, [x10, x9, lsl #1]
|
|
- sub x10, x10, w9, uxtw
|
|
+ adrp x10, L(put_tbl)
|
|
+ add x10, x10, :lo12: L(put_tbl)
|
|
+ ldr x10, [x10, x9, lsl #3]
|
|
br x10
|
|
|
|
2:
|
|
@@ -1106,14 +1116,16 @@ function put_neon
|
|
b.gt 128b
|
|
ret
|
|
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(put_tbl):
|
|
- .hword L(put_tbl) - 128b
|
|
- .hword L(put_tbl) - 64b
|
|
- .hword L(put_tbl) - 32b
|
|
- .hword L(put_tbl) - 16b
|
|
- .hword L(put_tbl) - 80b
|
|
- .hword L(put_tbl) - 4b
|
|
- .hword L(put_tbl) - 2b
|
|
+ .xword 128b
|
|
+ .xword 64b
|
|
+ .xword 32b
|
|
+ .xword 16b
|
|
+ .xword 80b
|
|
+ .xword 4b
|
|
+ .xword 2b
|
|
+ .popsection
|
|
endfunc
|
|
|
|
|
|
@@ -1121,11 +1133,11 @@ endfunc
|
|
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
|
|
// x8 to w*2.
|
|
function prep_neon
|
|
- adr x10, L(prep_tbl)
|
|
- ldrh w9, [x10, x9, lsl #1]
|
|
+ adrp x10, L(prep_tbl)
|
|
+ add x10, x10, :lo12: L(prep_tbl)
|
|
+ ldr x10, [x10, x9, lsl #3]
|
|
dup v31.8h, w7 // intermediate_bits
|
|
movi v30.8h, #(PREP_BIAS >> 8), lsl #8
|
|
- sub x10, x10, w9, uxtw
|
|
br x10
|
|
|
|
40:
|
|
@@ -1278,13 +1290,15 @@ function prep_neon
|
|
b.gt 128b
|
|
ret
|
|
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(prep_tbl):
|
|
- .hword L(prep_tbl) - 128b
|
|
- .hword L(prep_tbl) - 64b
|
|
- .hword L(prep_tbl) - 32b
|
|
- .hword L(prep_tbl) - 16b
|
|
- .hword L(prep_tbl) - 80b
|
|
- .hword L(prep_tbl) - 40b
|
|
+ .xword 128b
|
|
+ .xword 64b
|
|
+ .xword 32b
|
|
+ .xword 16b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .popsection
|
|
endfunc
|
|
|
|
|
|
@@ -1563,16 +1577,16 @@ L(\type\()_8tap_h):
|
|
add \xmx, x11, \mx, uxtw #3
|
|
b.ne L(\type\()_8tap_hv)
|
|
|
|
- adr x10, L(\type\()_8tap_h_tbl)
|
|
+ adrp x10, L(\type\()_8tap_h_tbl)
|
|
+ add x10, x10, :lo12: L(\type\()_8tap_h_tbl)
|
|
dup v30.4s, w12 // 6 - intermediate_bits
|
|
- ldrh w9, [x10, x9, lsl #1]
|
|
+ ldr x10, [x10, x9, lsl #3]
|
|
neg v30.4s, v30.4s // -(6-intermediate_bits)
|
|
.ifc \type, put
|
|
dup v29.8h, \bdmax // intermediate_bits
|
|
.else
|
|
movi v28.8h, #(PREP_BIAS >> 8), lsl #8
|
|
.endif
|
|
- sub x10, x10, w9, uxtw
|
|
.ifc \type, put
|
|
neg v29.8h, v29.8h // -intermediate_bits
|
|
.endif
|
|
@@ -1734,15 +1748,17 @@ L(\type\()_8tap_h):
|
|
b.gt 81b
|
|
ret
|
|
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(\type\()_8tap_h_tbl):
|
|
- .hword L(\type\()_8tap_h_tbl) - 1280b
|
|
- .hword L(\type\()_8tap_h_tbl) - 640b
|
|
- .hword L(\type\()_8tap_h_tbl) - 320b
|
|
- .hword L(\type\()_8tap_h_tbl) - 160b
|
|
- .hword L(\type\()_8tap_h_tbl) - 80b
|
|
- .hword L(\type\()_8tap_h_tbl) - 40b
|
|
- .hword L(\type\()_8tap_h_tbl) - 20b
|
|
- .hword 0
|
|
+ .xword 1280b
|
|
+ .xword 640b
|
|
+ .xword 320b
|
|
+ .xword 160b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .xword 20b
|
|
+ .xword 0
|
|
+ .popsection
|
|
|
|
|
|
L(\type\()_8tap_v):
|
|
@@ -1758,12 +1774,12 @@ L(\type\()_8tap_v):
|
|
dup v30.4s, w12 // 6 - intermediate_bits
|
|
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
|
|
.endif
|
|
- adr x10, L(\type\()_8tap_v_tbl)
|
|
- ldrh w9, [x10, x9, lsl #1]
|
|
+ adrp x10, L(\type\()_8tap_v_tbl)
|
|
+ add x10, x10, :lo12: L(\type\()_8tap_v_tbl)
|
|
+ ldr x10, [x10, x9, lsl #3]
|
|
.ifc \type, prep
|
|
neg v30.4s, v30.4s // -(6-intermediate_bits)
|
|
.endif
|
|
- sub x10, x10, w9, uxtw
|
|
br x10
|
|
|
|
20: // 2xN v
|
|
@@ -2029,15 +2045,17 @@ L(\type\()_8tap_v):
|
|
0:
|
|
ret
|
|
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(\type\()_8tap_v_tbl):
|
|
- .hword L(\type\()_8tap_v_tbl) - 1280b
|
|
- .hword L(\type\()_8tap_v_tbl) - 640b
|
|
- .hword L(\type\()_8tap_v_tbl) - 320b
|
|
- .hword L(\type\()_8tap_v_tbl) - 160b
|
|
- .hword L(\type\()_8tap_v_tbl) - 80b
|
|
- .hword L(\type\()_8tap_v_tbl) - 40b
|
|
- .hword L(\type\()_8tap_v_tbl) - 20b
|
|
- .hword 0
|
|
+ .xword 1280b
|
|
+ .xword 640b
|
|
+ .xword 320b
|
|
+ .xword 160b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .xword 20b
|
|
+ .xword 0
|
|
+ .popsection
|
|
|
|
L(\type\()_8tap_hv):
|
|
cmp \h, #4
|
|
@@ -2048,16 +2066,16 @@ L(\type\()_8tap_hv):
|
|
4:
|
|
add \xmy, x11, \my, uxtw #3
|
|
|
|
- adr x10, L(\type\()_8tap_hv_tbl)
|
|
+ adrp x10, L(\type\()_8tap_hv_tbl)
|
|
+ add x10, x10, :lo12: L(\type\()_8tap_hv_tbl)
|
|
dup v30.4s, w12 // 6 - intermediate_bits
|
|
- ldrh w9, [x10, x9, lsl #1]
|
|
+ ldr x10, [x10, x9, lsl #3]
|
|
neg v30.4s, v30.4s // -(6-intermediate_bits)
|
|
.ifc \type, put
|
|
dup v29.4s, w13 // 6 + intermediate_bits
|
|
.else
|
|
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
|
|
.endif
|
|
- sub x10, x10, w9, uxtw
|
|
.ifc \type, put
|
|
neg v29.4s, v29.4s // -(6+intermediate_bits)
|
|
.endif
|
|
@@ -2623,15 +2641,17 @@ L(\type\()_8tap_filter_8):
|
|
uzp1 v24.8h, v27.8h, v28.8h // Ditto
|
|
ret
|
|
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(\type\()_8tap_hv_tbl):
|
|
- .hword L(\type\()_8tap_hv_tbl) - 1280b
|
|
- .hword L(\type\()_8tap_hv_tbl) - 640b
|
|
- .hword L(\type\()_8tap_hv_tbl) - 320b
|
|
- .hword L(\type\()_8tap_hv_tbl) - 160b
|
|
- .hword L(\type\()_8tap_hv_tbl) - 80b
|
|
- .hword L(\type\()_8tap_hv_tbl) - 40b
|
|
- .hword L(\type\()_8tap_hv_tbl) - 20b
|
|
- .hword 0
|
|
+ .xword 1280b
|
|
+ .xword 640b
|
|
+ .xword 320b
|
|
+ .xword 160b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .xword 20b
|
|
+ .xword 0
|
|
+ .popsection
|
|
endfunc
|
|
|
|
|
|
@@ -2665,16 +2685,16 @@ function \type\()_bilin_16bpc_neon, export=1
|
|
L(\type\()_bilin_h):
|
|
cbnz \my, L(\type\()_bilin_hv)
|
|
|
|
- adr x10, L(\type\()_bilin_h_tbl)
|
|
+ adrp x10, L(\type\()_bilin_h_tbl)
|
|
+ add x10, x10, :lo12: L(\type\()_bilin_h_tbl)
|
|
dup v31.8h, w11 // 4 - intermediate_bits
|
|
- ldrh w9, [x10, x9, lsl #1]
|
|
+ ldr x10, [x10, x9, lsl #3]
|
|
neg v31.8h, v31.8h // -(4-intermediate_bits)
|
|
.ifc \type, put
|
|
dup v30.8h, \bdmax // intermediate_bits
|
|
.else
|
|
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
|
|
.endif
|
|
- sub x10, x10, w9, uxtw
|
|
.ifc \type, put
|
|
neg v30.8h, v30.8h // -intermediate_bits
|
|
.endif
|
|
@@ -2832,29 +2852,31 @@ L(\type\()_bilin_h):
|
|
b.gt 161b
|
|
ret
|
|
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(\type\()_bilin_h_tbl):
|
|
- .hword L(\type\()_bilin_h_tbl) - 1280b
|
|
- .hword L(\type\()_bilin_h_tbl) - 640b
|
|
- .hword L(\type\()_bilin_h_tbl) - 320b
|
|
- .hword L(\type\()_bilin_h_tbl) - 160b
|
|
- .hword L(\type\()_bilin_h_tbl) - 80b
|
|
- .hword L(\type\()_bilin_h_tbl) - 40b
|
|
- .hword L(\type\()_bilin_h_tbl) - 20b
|
|
- .hword 0
|
|
+ .xword 1280b
|
|
+ .xword 640b
|
|
+ .xword 320b
|
|
+ .xword 160b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .xword 20b
|
|
+ .xword 0
|
|
+ .popsection
|
|
|
|
|
|
L(\type\()_bilin_v):
|
|
cmp \h, #4
|
|
- adr x10, L(\type\()_bilin_v_tbl)
|
|
+ adrp x10, L(\type\()_bilin_v_tbl)
|
|
+ add x10, x10, :lo12: L(\type\()_bilin_v_tbl)
|
|
.ifc \type, prep
|
|
dup v31.8h, w11 // 4 - intermediate_bits
|
|
.endif
|
|
- ldrh w9, [x10, x9, lsl #1]
|
|
+ ldr x10, [x10, x9, lsl #3]
|
|
.ifc \type, prep
|
|
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
|
|
neg v31.8h, v31.8h // -(4-intermediate_bits)
|
|
.endif
|
|
- sub x10, x10, w9, uxtw
|
|
br x10
|
|
|
|
20: // 2xN v
|
|
@@ -3030,27 +3052,29 @@ L(\type\()_bilin_v):
|
|
0:
|
|
ret
|
|
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(\type\()_bilin_v_tbl):
|
|
- .hword L(\type\()_bilin_v_tbl) - 1280b
|
|
- .hword L(\type\()_bilin_v_tbl) - 640b
|
|
- .hword L(\type\()_bilin_v_tbl) - 320b
|
|
- .hword L(\type\()_bilin_v_tbl) - 160b
|
|
- .hword L(\type\()_bilin_v_tbl) - 80b
|
|
- .hword L(\type\()_bilin_v_tbl) - 40b
|
|
- .hword L(\type\()_bilin_v_tbl) - 20b
|
|
- .hword 0
|
|
+ .xword 1280b
|
|
+ .xword 640b
|
|
+ .xword 320b
|
|
+ .xword 160b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .xword 20b
|
|
+ .xword 0
|
|
+ .popsection
|
|
|
|
L(\type\()_bilin_hv):
|
|
- adr x10, L(\type\()_bilin_hv_tbl)
|
|
+ adrp x10, L(\type\()_bilin_hv_tbl)
|
|
+ add x10, x10, :lo12: L(\type\()_bilin_hv_tbl)
|
|
dup v31.8h, w11 // 4 - intermediate_bits
|
|
- ldrh w9, [x10, x9, lsl #1]
|
|
+ ldr x10, [x10, x9, lsl #3]
|
|
neg v31.8h, v31.8h // -(4-intermediate_bits)
|
|
.ifc \type, put
|
|
dup v30.4s, w12 // 4 + intermediate_bits
|
|
.else
|
|
movi v29.8h, #(PREP_BIAS >> 8), lsl #8
|
|
.endif
|
|
- sub x10, x10, w9, uxtw
|
|
.ifc \type, put
|
|
neg v30.4s, v30.4s // -(4+intermediate_bits)
|
|
.endif
|
|
@@ -3224,15 +3248,17 @@ L(\type\()_bilin_hv):
|
|
0:
|
|
ret
|
|
|
|
+ .pushsection .data.rel.ro, "aw"
|
|
L(\type\()_bilin_hv_tbl):
|
|
- .hword L(\type\()_bilin_hv_tbl) - 1280b
|
|
- .hword L(\type\()_bilin_hv_tbl) - 640b
|
|
- .hword L(\type\()_bilin_hv_tbl) - 320b
|
|
- .hword L(\type\()_bilin_hv_tbl) - 160b
|
|
- .hword L(\type\()_bilin_hv_tbl) - 80b
|
|
- .hword L(\type\()_bilin_hv_tbl) - 40b
|
|
- .hword L(\type\()_bilin_hv_tbl) - 20b
|
|
- .hword 0
|
|
+ .xword 1280b
|
|
+ .xword 640b
|
|
+ .xword 320b
|
|
+ .xword 160b
|
|
+ .xword 80b
|
|
+ .xword 40b
|
|
+ .xword 20b
|
|
+ .xword 0
|
|
+ .popsection
|
|
endfunc
|
|
.endm
|
|
|