187 lines
7.4 KiB
Text
187 lines
7.4 KiB
Text
|
Index: src/arm/64/filmgrain16.S
|
||
|
--- src/arm/64/filmgrain16.S.orig
|
||
|
+++ src/arm/64/filmgrain16.S
|
||
|
@@ -740,12 +740,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
|
||
|
add x4, x1, #FGD_AR_COEFFS_UV
|
||
|
.endif
|
||
|
add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
|
||
|
- adr x16, L(gen_grain_\type\()_tbl)
|
||
|
+ adrp x16, L(gen_grain_\type\()_tbl)
|
||
|
+ add x16, x16, :lo12: L(gen_grain_\type\()_tbl)
|
||
|
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||
|
add w9, w9, #4
|
||
|
- ldrh w17, [x16, w17, uxtw #1]
|
||
|
+ ldr x16, [x16, w17, uxtw #3]
|
||
|
dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
|
||
|
- sub x16, x16, w17, uxtw
|
||
|
neg v31.8h, v31.8h
|
||
|
|
||
|
.ifc \type, uv_444
|
||
|
@@ -946,11 +946,13 @@ L(generate_grain_\type\()_lag3):
|
||
|
AARCH64_VALIDATE_LINK_REGISTER
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw" // emit the lag dispatch table in .data.rel.ro (flags aw = allocatable + writable) instead of inline after the code; the previous section is saved
|
||
|
L(gen_grain_\type\()_tbl):
|
||
|
- .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||
|
- .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||
|
- .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||
|
- .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||
|
+ .xword L(generate_grain_\type\()_lag0) // entry 0: 64-bit absolute address of the lag0 label (replaces the 16-bit table-relative offset); the dispatch site indexes entries by the FGD_AR_COEFF_LAG value, scaled by 8
|
||
|
+ .xword L(generate_grain_\type\()_lag1) // entry 1: lag1 label address. NOTE(review): no alignment directive is visible before these 8-byte entries - confirm the section is 8-byte aligned
|
||
|
+ .xword L(generate_grain_\type\()_lag2) // entry 2: lag2 label address
|
||
|
+ .xword L(generate_grain_\type\()_lag3) // entry 3: lag3 label address
|
||
|
+ .popsection // switch back to the section that was active before .pushsection
|
||
|
endfunc
|
||
|
.endm
|
||
|
|
||
|
@@ -991,12 +993,12 @@ function generate_grain_\type\()_16bpc_neon, export=1
|
||
|
ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT]
|
||
|
add x4, x1, #FGD_AR_COEFFS_UV
|
||
|
add w9, w9, w15 // grain_scale_shift - bitdepth_min_8
|
||
|
- adr x16, L(gen_grain_\type\()_tbl)
|
||
|
+ adrp x16, L(gen_grain_\type\()_tbl)
|
||
|
+ add x16, x16, :lo12: L(gen_grain_\type\()_tbl)
|
||
|
ldr w17, [x1, #FGD_AR_COEFF_LAG]
|
||
|
add w9, w9, #4
|
||
|
- ldrh w17, [x16, w17, uxtw #1]
|
||
|
+ ldr x16, [x16, w17, uxtw #3]
|
||
|
dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift
|
||
|
- sub x16, x16, w17, uxtw
|
||
|
neg v31.8h, v31.8h
|
||
|
|
||
|
cmp w13, #0
|
||
|
@@ -1156,11 +1158,13 @@ L(generate_grain_\type\()_lag3):
|
||
|
AARCH64_VALIDATE_LINK_REGISTER
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw" // emit the lag dispatch table in .data.rel.ro (flags aw = allocatable + writable) instead of inline after the code; the previous section is saved
|
||
|
L(gen_grain_\type\()_tbl):
|
||
|
- .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0)
|
||
|
- .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1)
|
||
|
- .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2)
|
||
|
- .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3)
|
||
|
+ .xword L(generate_grain_\type\()_lag0) // entry 0: 64-bit absolute address of the lag0 label (replaces the 16-bit table-relative offset); the dispatch site indexes entries by the FGD_AR_COEFF_LAG value, scaled by 8
|
||
|
+ .xword L(generate_grain_\type\()_lag1) // entry 1: lag1 label address. NOTE(review): no alignment directive is visible before these 8-byte entries - confirm the section is 8-byte aligned
|
||
|
+ .xword L(generate_grain_\type\()_lag2) // entry 2: lag2 label address
|
||
|
+ .xword L(generate_grain_\type\()_lag3) // entry 3: lag3 label address
|
||
|
+ .popsection // switch back to the section that was active before .pushsection
|
||
|
endfunc
|
||
|
.endm
|
||
|
|
||
|
@@ -1306,19 +1310,18 @@ function fgy_32x32_16bpc_neon, export=1
|
||
|
add_offset x5, w6, x10, x5, x9
|
||
|
|
||
|
ldr w11, [sp, #88] // type
|
||
|
- adr x13, L(fgy_loop_tbl)
|
||
|
+ adrp x13, L(fgy_loop_tbl)
|
||
|
+ add x13, x13, :lo12: L(fgy_loop_tbl)
|
||
|
|
||
|
add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx
|
||
|
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
|
||
|
|
||
|
tst w11, #1
|
||
|
- ldrh w11, [x13, w11, uxtw #1]
|
||
|
+ ldr x11, [x13, w11, uxtw #3]
|
||
|
|
||
|
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
|
||
|
add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx
|
||
|
|
||
|
- sub x11, x13, w11, uxtw
|
||
|
-
|
||
|
b.eq 1f
|
||
|
// y overlap
|
||
|
dup v8.8h, v27.h[0]
|
||
|
@@ -1481,11 +1484,13 @@ L(loop_\ox\oy):
|
||
|
fgy 1, 0
|
||
|
fgy 1, 1
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw" // emit the fgy loop dispatch table in .data.rel.ro (flags aw = allocatable + writable) instead of inline after the code; the previous section is saved
|
||
|
L(fgy_loop_tbl):
|
||
|
- .hword L(fgy_loop_tbl) - L(loop_00)
|
||
|
- .hword L(fgy_loop_tbl) - L(loop_01)
|
||
|
- .hword L(fgy_loop_tbl) - L(loop_10)
|
||
|
- .hword L(fgy_loop_tbl) - L(loop_11)
|
||
|
+ .xword L(loop_00) // entry 0: 64-bit absolute address of loop_00 (replaces the 16-bit table-relative offset); the dispatch site indexes entries by the type value, scaled by 8
|
||
|
+ .xword L(loop_01) // entry 1: loop_01 address. NOTE(review): no alignment directive is visible before these 8-byte entries - confirm the section is 8-byte aligned
|
||
|
+ .xword L(loop_10) // entry 2: loop_10 address
|
||
|
+ .xword L(loop_11) // entry 3: loop_11 address
|
||
|
+ .popsection // switch back to the section that was active before .pushsection
|
||
|
endfunc
|
||
|
|
||
|
// void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
|
||
|
@@ -1589,11 +1594,12 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
|
||
|
ldr w13, [sp, #112] // type
|
||
|
|
||
|
movrel x16, overlap_coeffs_\sx
|
||
|
- adr x14, L(fguv_loop_sx\sx\()_tbl)
|
||
|
+ adrp x14, L(fguv_loop_sx\sx\()_tbl)
|
||
|
+ add x14, x14, :lo12: L(fguv_loop_sx\sx\()_tbl)
|
||
|
|
||
|
ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs
|
||
|
tst w13, #1
|
||
|
- ldrh w13, [x14, w13, uxtw #1]
|
||
|
+ ldr x13, [x14, w13, uxtw #3]
|
||
|
|
||
|
b.eq 1f
|
||
|
// y overlap
|
||
|
@@ -1601,8 +1607,6 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
|
||
|
mov w9, #(2 >> \sy)
|
||
|
|
||
|
1:
|
||
|
- sub x13, x14, w13, uxtw
|
||
|
-
|
||
|
.if \sy
|
||
|
movi v25.8h, #23
|
||
|
movi v26.8h, #22
|
||
|
@@ -1819,15 +1823,17 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
|
||
|
AARCH64_VALIDATE_LINK_REGISTER
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw" // emit the sx0 loop dispatch table in .data.rel.ro (flags aw = allocatable + writable) instead of inline after the code; the previous section is saved
|
||
|
L(fguv_loop_sx0_tbl):
|
||
|
- .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00)
|
||
|
- .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01)
|
||
|
- .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10)
|
||
|
- .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11)
|
||
|
- .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00)
|
||
|
- .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01)
|
||
|
- .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10)
|
||
|
- .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11)
|
||
|
+ .xword L(fguv_loop_sx0_csfl0_00) // entry 0: 64-bit absolute address of the csfl0_00 loop (replaces the 16-bit table-relative offset); entries are fetched with an 8-byte scaled index at the dispatch site
|
||
|
+ .xword L(fguv_loop_sx0_csfl0_01) // entry 1: csfl0_01 loop address. NOTE(review): no alignment directive is visible before these 8-byte entries - confirm the section is 8-byte aligned
|
||
|
+ .xword L(fguv_loop_sx0_csfl0_10) // entry 2: csfl0_10 loop address
|
||
|
+ .xword L(fguv_loop_sx0_csfl0_11) // entry 3: csfl0_11 loop address
|
||
|
+ .xword L(fguv_loop_sx0_csfl1_00) // entry 4: csfl1_00 loop address
|
||
|
+ .xword L(fguv_loop_sx0_csfl1_01) // entry 5: csfl1_01 loop address
|
||
|
+ .xword L(fguv_loop_sx0_csfl1_10) // entry 6: csfl1_10 loop address
|
||
|
+ .xword L(fguv_loop_sx0_csfl1_11) // entry 7: csfl1_11 loop address
|
||
|
+ .popsection // switch back to the section that was active before .pushsection
|
||
|
endfunc
|
||
|
|
||
|
function fguv_loop_sx1_neon
|
||
|
@@ -1985,13 +1991,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
|
||
|
AARCH64_VALIDATE_LINK_REGISTER
|
||
|
ret
|
||
|
|
||
|
+ .pushsection .data.rel.ro, "aw" // emit the sx1 loop dispatch table in .data.rel.ro (flags aw = allocatable + writable) instead of inline after the code; the previous section is saved
|
||
|
L(fguv_loop_sx1_tbl):
|
||
|
- .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00)
|
||
|
- .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01)
|
||
|
- .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10)
|
||
|
- .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11)
|
||
|
- .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00)
|
||
|
- .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01)
|
||
|
- .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10)
|
||
|
- .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11)
|
||
|
+ .xword L(fguv_loop_sx1_csfl0_00) // entry 0: 64-bit absolute address of the csfl0_00 loop (replaces the 16-bit table-relative offset); entries are fetched with an 8-byte scaled index at the dispatch site
|
||
|
+ .xword L(fguv_loop_sx1_csfl0_01) // entry 1: csfl0_01 loop address. NOTE(review): no alignment directive is visible before these 8-byte entries - confirm the section is 8-byte aligned
|
||
|
+ .xword L(fguv_loop_sx1_csfl0_10) // entry 2: csfl0_10 loop address
|
||
|
+ .xword L(fguv_loop_sx1_csfl0_11) // entry 3: csfl0_11 loop address
|
||
|
+ .xword L(fguv_loop_sx1_csfl1_00) // entry 4: csfl1_00 loop address
|
||
|
+ .xword L(fguv_loop_sx1_csfl1_01) // entry 5: csfl1_01 loop address
|
||
|
+ .xword L(fguv_loop_sx1_csfl1_10) // entry 6: csfl1_10 loop address
|
||
|
+ .xword L(fguv_loop_sx1_csfl1_11) // entry 7: csfl1_11 loop address
|
||
|
+ .popsection // switch back to the section that was active before .pushsection
|
||
|
endfunc
|