Store the film grain jump tables in .data.rel.ro as absolute addresses.

This patch touches src/arm/64/filmgrain16.S and changes how the dispatch
tables gen_grain_\type\()_tbl (in both generate_grain macro variants),
fgy_loop_tbl, fguv_loop_sx0_tbl and fguv_loop_sx1_tbl are stored and read.

Before: each table was a list of .hword entries emitted in line with the
function body, just before endfunc, each entry holding
(table label - target label).  The code took the table address with adr,
loaded one 16-bit entry with ldrh (index scaled by 2, uxtw #1) and
subtracted it from the table address to obtain the target.

After: each table is wrapped in .pushsection .data.rel.ro, "aw" /
.popsection and holds one .xword per target containing the target label
itself.  The code forms the table address with adrp plus add :lo12:,
loads one 64-bit entry with ldr (index scaled by 8, uxtw #3), and the sub
is dropped.  The loaded target lands in the same register the removed sub
used to write (x16, x11 and x13 respectively).

Presumably the goal is that the code no longer reads data located next to
the instructions (e.g. for execute-only text mappings) - TODO confirm.

NOTE(review): no alignment directive is visible before the .xword tables
in these hunks; confirm the tables end up 8-byte aligned.
NOTE(review): the fguv function already uses the movrel helper for
overlap_coeffs; check whether movrel could replace the open-coded
adrp/add pair - TODO confirm it accepts L() labels.
NOTE(review): in this view the hunks are joined onto three physical
lines; a unified diff needs one physical line per hunk line, so confirm
the file on disk still has its line breaks.

Index: src/arm/64/filmgrain16.S --- src/arm/64/filmgrain16.S.orig +++ src/arm/64/filmgrain16.S @@ -740,12 +740,12 @@ function generate_grain_\type\()_16bpc_neon, export=1 add x4, x1, #FGD_AR_COEFFS_UV .endif add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 - adr x16, L(gen_grain_\type\()_tbl) + adrp x16, L(gen_grain_\type\()_tbl) + add x16, x16, :lo12: L(gen_grain_\type\()_tbl) ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] + ldr x16, [x16, w17, uxtw #3] dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift - sub x16, x16, w17, uxtw neg v31.8h, v31.8h .ifc \type, uv_444 @@ -946,11 +946,13 @@ L(generate_grain_\type\()_lag3): AARCH64_VALIDATE_LINK_REGISTER ret + .pushsection .data.rel.ro, "aw" L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) + .xword L(generate_grain_\type\()_lag0) + .xword L(generate_grain_\type\()_lag1) + .xword L(generate_grain_\type\()_lag2) + .xword L(generate_grain_\type\()_lag3) + .popsection endfunc .endm @@ -991,12 +993,12 @@ function generate_grain_\type\()_16bpc_neon, export=1 ldr w9, [x1, #FGD_GRAIN_SCALE_SHIFT] add x4, x1, #FGD_AR_COEFFS_UV add w9, w9, w15 // grain_scale_shift - bitdepth_min_8 - adr x16, L(gen_grain_\type\()_tbl) + adrp x16, L(gen_grain_\type\()_tbl) + add x16, x16, :lo12: L(gen_grain_\type\()_tbl) ldr w17, [x1, #FGD_AR_COEFF_LAG] add w9, w9, #4 - ldrh w17, [x16, w17, uxtw #1] + ldr x16, [x16, w17, uxtw #3] dup v31.8h, w9 // 4 - bitdepth_min_8 + data->grain_scale_shift - sub x16, x16, w17, uxtw neg v31.8h, v31.8h cmp w13, #0 @@ -1156,11 +1158,13 @@ L(generate_grain_\type\()_lag3): AARCH64_VALIDATE_LINK_REGISTER ret + .pushsection .data.rel.ro, "aw" L(gen_grain_\type\()_tbl): - .hword L(gen_grain_\type\()_tbl) - 
L(generate_grain_\type\()_lag0) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag1) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag2) - .hword L(gen_grain_\type\()_tbl) - L(generate_grain_\type\()_lag3) + .xword L(generate_grain_\type\()_lag0) + .xword L(generate_grain_\type\()_lag1) + .xword L(generate_grain_\type\()_lag2) + .xword L(generate_grain_\type\()_lag3) + .popsection endfunc .endm @@ -1306,19 +1310,18 @@ function fgy_32x32_16bpc_neon, export=1 add_offset x5, w6, x10, x5, x9 ldr w11, [sp, #88] // type - adr x13, L(fgy_loop_tbl) + adrp x13, L(fgy_loop_tbl) + add x13, x13, :lo12: L(fgy_loop_tbl) add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by tst w11, #1 - ldrh w11, [x13, w11, uxtw #1] + ldr x11, [x13, w11, uxtw #3] add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx - sub x11, x13, w11, uxtw - b.eq 1f // y overlap dup v8.8h, v27.h[0] @@ -1481,11 +1484,13 @@ L(loop_\ox\oy): fgy 1, 0 fgy 1, 1 + .pushsection .data.rel.ro, "aw" L(fgy_loop_tbl): - .hword L(fgy_loop_tbl) - L(loop_00) - .hword L(fgy_loop_tbl) - L(loop_01) - .hword L(fgy_loop_tbl) - L(loop_10) - .hword L(fgy_loop_tbl) - L(loop_11) + .xword L(loop_00) + .xword L(loop_01) + .xword L(loop_10) + .xword L(loop_11) + .popsection endfunc // void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst, @@ -1589,11 +1594,12 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1 ldr w13, [sp, #112] // type movrel x16, overlap_coeffs_\sx - adr x14, L(fguv_loop_sx\sx\()_tbl) + adrp x14, L(fguv_loop_sx\sx\()_tbl) + add x14, x14, :lo12: L(fguv_loop_sx\sx\()_tbl) ld1 {v27.4h, v28.4h}, [x16] // overlap_coeffs tst w13, #1 - ldrh w13, [x14, w13, uxtw #1] + ldr x13, [x14, w13, uxtw #3] b.eq 1f // y overlap @@ -1601,8 +1607,6 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1 mov w9, #(2 >> \sy) 1: - sub x13, x14, w13, uxtw - .if \sy movi 
v25.8h, #23 movi v26.8h, #22 @@ -1819,15 +1823,17 @@ L(fguv_loop_sx0_csfl\csfl\()_\ox\oy): AARCH64_VALIDATE_LINK_REGISTER ret + .pushsection .data.rel.ro, "aw" L(fguv_loop_sx0_tbl): - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_10) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl0_11) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_00) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_01) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_10) - .hword L(fguv_loop_sx0_tbl) - L(fguv_loop_sx0_csfl1_11) + .xword L(fguv_loop_sx0_csfl0_00) + .xword L(fguv_loop_sx0_csfl0_01) + .xword L(fguv_loop_sx0_csfl0_10) + .xword L(fguv_loop_sx0_csfl0_11) + .xword L(fguv_loop_sx0_csfl1_00) + .xword L(fguv_loop_sx0_csfl1_01) + .xword L(fguv_loop_sx0_csfl1_10) + .xword L(fguv_loop_sx0_csfl1_11) + .popsection endfunc function fguv_loop_sx1_neon @@ -1985,13 +1991,15 @@ L(fguv_loop_sx1_csfl\csfl\()_\ox\oy): AARCH64_VALIDATE_LINK_REGISTER ret + .pushsection .data.rel.ro, "aw" L(fguv_loop_sx1_tbl): - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl0_11) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_00) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_01) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_10) - .hword L(fguv_loop_sx1_tbl) - L(fguv_loop_sx1_csfl1_11) + .xword L(fguv_loop_sx1_csfl0_00) + .xword L(fguv_loop_sx1_csfl0_01) + .xword L(fguv_loop_sx1_csfl0_10) + .xword L(fguv_loop_sx1_csfl0_11) + .xword L(fguv_loop_sx1_csfl1_00) + .xword L(fguv_loop_sx1_csfl1_01) + .xword L(fguv_loop_sx1_csfl1_10) + .xword L(fguv_loop_sx1_csfl1_11) + .popsection endfunc