Index: src/arm/64/mc16.S --- src/arm/64/mc16.S.orig +++ src/arm/64/mc16.S @@ -145,11 +145,11 @@ function \type\()_16bpc_neon, export=1 dup v27.4s, w6 neg v27.4s, v27.4s .endif - adr x7, L(\type\()_tbl) + adrp x7, L(\type\()_tbl) + add x7, x7, :lo12: L(\type\()_tbl) sub w4, w4, #24 \type v4, v5, v0, v1, v2, v3 - ldrh w4, [x7, x4, lsl #1] - sub x7, x7, w4, uxtw + ldr x7, [x7, x4, lsl #3] br x7 40: AARCH64_VALID_JUMP_TARGET @@ -228,13 +228,15 @@ function \type\()_16bpc_neon, export=1 b 128b 0: ret + .pushsection .data.rel.ro, "aw" L(\type\()_tbl): - .hword L(\type\()_tbl) - 1280b - .hword L(\type\()_tbl) - 640b - .hword L(\type\()_tbl) - 32b - .hword L(\type\()_tbl) - 16b - .hword L(\type\()_tbl) - 80b - .hword L(\type\()_tbl) - 40b + .xword 1280b + .xword 640b + .xword 32b + .xword 16b + .xword 80b + .xword 40b + .popsection endfunc .endm @@ -247,12 +249,12 @@ bidir_fn mask, w7 function w_mask_\type\()_16bpc_neon, export=1 ldr w8, [sp] clz w9, w4 - adr x10, L(w_mask_\type\()_tbl) + adrp x10, L(w_mask_\type\()_tbl) + add x10, x10, :lo12: L(w_mask_\type\()_tbl) dup v31.8h, w8 // bitdepth_max sub w9, w9, #24 clz w8, w8 // clz(bitdepth_max) - ldrh w9, [x10, x9, lsl #1] - sub x10, x10, w9, uxtw + ldr x10, [x10, x9, lsl #3] sub w8, w8, #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12 mov w9, #PREP_BIAS*64 neg w8, w8 // -sh @@ -541,13 +543,15 @@ function w_mask_\type\()_16bpc_neon, export=1 add x12, x12, x1 b.gt 161b ret + .pushsection .data.rel.ro, "aw" L(w_mask_\type\()_tbl): - .hword L(w_mask_\type\()_tbl) - 1280b - .hword L(w_mask_\type\()_tbl) - 640b - .hword L(w_mask_\type\()_tbl) - 320b - .hword L(w_mask_\type\()_tbl) - 160b - .hword L(w_mask_\type\()_tbl) - 8b - .hword L(w_mask_\type\()_tbl) - 4b + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 8b + .xword 4b + .popsection endfunc .endm @@ -557,11 +561,11 @@ w_mask_fn 420 function blend_16bpc_neon, export=1 - adr x6, L(blend_tbl) + adrp x6, L(blend_tbl) + add x6, x6, :lo12: L(blend_tbl) clz w3, w3 sub w3, w3, #26 - ldrh w3, [x6, x3, lsl #1] - sub x6, x6, w3, uxtw + ldr x6, [x6, x3, lsl #3] add x8, x0, x1 br x6 40: @@ -673,15 +677,18 @@ function blend_16bpc_neon, export=1 st1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1 b.gt 32b ret + .pushsection .data.rel.ro, "aw" L(blend_tbl): - .hword L(blend_tbl) - 32b - .hword L(blend_tbl) - 160b - .hword L(blend_tbl) - 80b - .hword L(blend_tbl) - 40b + .xword 32b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc function blend_h_16bpc_neon, export=1 - adr x6, L(blend_h_tbl) + adrp x6, L(blend_h_tbl) + add x6, x6, :lo12: L(blend_h_tbl) movrel x5, X(obmc_masks) add x5, x5, w4, uxtw sub w4, w4, w4, lsr #2 @@ -689,8 +696,7 @@ function blend_h_16bpc_neon, export=1 add x8, x0, x1 lsl x1, x1, #1 sub w7, w7, #24 - ldrh w7, [x6, x7, lsl #1] - sub x6, x6, w7, uxtw + ldr x6, [x6, x7, lsl #3] br x6 2: AARCH64_VALID_JUMP_TARGET @@ -835,26 +841,28 @@ function blend_h_16bpc_neon, export=1 add x7, x7, w3, uxtw #1 b.gt 321b ret + .pushsection .data.rel.ro, "aw" L(blend_h_tbl): - .hword L(blend_h_tbl) - 1280b - .hword L(blend_h_tbl) - 640b - .hword L(blend_h_tbl) - 320b - .hword L(blend_h_tbl) - 16b - .hword L(blend_h_tbl) - 8b - .hword L(blend_h_tbl) - 4b - .hword L(blend_h_tbl) - 2b + .xword 1280b + .xword 640b + .xword 320b + .xword 16b + .xword 8b + .xword 4b + .xword 2b + .popsection endfunc function blend_v_16bpc_neon, export=1 - adr x6, L(blend_v_tbl) + adrp x6, L(blend_v_tbl) + add x6, x6, :lo12: L(blend_v_tbl) movrel x5, X(obmc_masks) add x5, x5, w3, uxtw clz w3, w3 add x8, x0, x1 lsl x1, x1, #1 sub w3, w3, #26 - ldrh w3, [x6, x3, lsl #1] - sub x6, x6, w3, uxtw + ldr x6, [x6, x3, lsl #3] br x6 20: AARCH64_VALID_JUMP_TARGET @@ -992,21 +1000,23 @@ function blend_v_16bpc_neon, export=1 st1 {v4.8h, v5.8h, v6.8h}, [x8], x1 b.gt 32b ret + .pushsection .data.rel.ro, "aw" L(blend_v_tbl): - .hword L(blend_v_tbl) - 320b - .hword L(blend_v_tbl) - 160b - .hword L(blend_v_tbl) - 80b - .hword L(blend_v_tbl) - 40b - .hword L(blend_v_tbl) - 20b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .popsection endfunc // This has got the same signature as the put_8tap functions, // and assumes that x9 is set to (clz(w)-24). function put_neon - adr x10, L(put_tbl) - ldrh w9, [x10, x9, lsl #1] - sub x10, x10, w9, uxtw + adrp x10, L(put_tbl) + add x10, x10, :lo12: L(put_tbl) + ldr x10, [x10, x9, lsl #3] br x10 2: @@ -1106,14 +1116,16 @@ function put_neon b.gt 128b ret + .pushsection .data.rel.ro, "aw" L(put_tbl): - .hword L(put_tbl) - 128b - .hword L(put_tbl) - 64b - .hword L(put_tbl) - 32b - .hword L(put_tbl) - 16b - .hword L(put_tbl) - 80b - .hword L(put_tbl) - 4b - .hword L(put_tbl) - 2b + .xword 128b + .xword 64b + .xword 32b + .xword 16b + .xword 80b + .xword 4b + .xword 2b + .popsection endfunc @@ -1121,11 +1133,11 @@ endfunc // and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and // x8 to w*2. function prep_neon - adr x10, L(prep_tbl) - ldrh w9, [x10, x9, lsl #1] + adrp x10, L(prep_tbl) + add x10, x10, :lo12: L(prep_tbl) + ldr x10, [x10, x9, lsl #3] dup v31.8h, w7 // intermediate_bits movi v30.8h, #(PREP_BIAS >> 8), lsl #8 - sub x10, x10, w9, uxtw br x10 40: @@ -1278,13 +1290,15 @@ function prep_neon b.gt 128b ret + .pushsection .data.rel.ro, "aw" L(prep_tbl): - .hword L(prep_tbl) - 128b - .hword L(prep_tbl) - 64b - .hword L(prep_tbl) - 32b - .hword L(prep_tbl) - 16b - .hword L(prep_tbl) - 80b - .hword L(prep_tbl) - 40b + .xword 128b + .xword 64b + .xword 32b + .xword 16b + .xword 80b + .xword 40b + .popsection endfunc @@ -1563,16 +1577,16 @@ L(\type\()_8tap_h): add \xmx, x11, \mx, uxtw #3 b.ne L(\type\()_8tap_hv) - adr x10, L(\type\()_8tap_h_tbl) + adrp x10, L(\type\()_8tap_h_tbl) + add x10, x10, :lo12: L(\type\()_8tap_h_tbl) dup v30.4s, w12 // 6 - intermediate_bits - ldrh w9, [x10, x9, lsl #1] + ldr x10, [x10, x9, lsl #3] neg v30.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put dup v29.8h, \bdmax // intermediate_bits .else movi v28.8h, #(PREP_BIAS >> 8), lsl #8 .endif - sub x10, x10, w9, uxtw .ifc \type, put neg v29.8h, v29.8h // -intermediate_bits .endif @@ -1734,15 +1748,17 @@ L(\type\()_8tap_h): b.gt 81b ret + .pushsection .data.rel.ro, "aw" L(\type\()_8tap_h_tbl): - .hword L(\type\()_8tap_h_tbl) - 1280b - .hword L(\type\()_8tap_h_tbl) - 640b - .hword L(\type\()_8tap_h_tbl) - 320b - .hword L(\type\()_8tap_h_tbl) - 160b - .hword L(\type\()_8tap_h_tbl) - 80b - .hword L(\type\()_8tap_h_tbl) - 40b - .hword L(\type\()_8tap_h_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection L(\type\()_8tap_v): @@ -1758,12 +1774,12 @@ L(\type\()_8tap_v): dup v30.4s, w12 // 6 - intermediate_bits movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - adr x10, L(\type\()_8tap_v_tbl) - ldrh w9, [x10, x9, lsl #1] + adrp x10, L(\type\()_8tap_v_tbl) + add x10, x10, :lo12: L(\type\()_8tap_v_tbl) + ldr x10, [x10, x9, lsl #3] .ifc \type, prep neg v30.4s, v30.4s // -(6-intermediate_bits) .endif - sub x10, x10, w9, uxtw br x10 20: // 2xN v @@ -2029,15 +2045,17 @@ L(\type\()_8tap_v): 0: ret + .pushsection .data.rel.ro, "aw" L(\type\()_8tap_v_tbl): - .hword L(\type\()_8tap_v_tbl) - 1280b - .hword L(\type\()_8tap_v_tbl) - 640b - .hword L(\type\()_8tap_v_tbl) - 320b - .hword L(\type\()_8tap_v_tbl) - 160b - .hword L(\type\()_8tap_v_tbl) - 80b - .hword L(\type\()_8tap_v_tbl) - 40b - .hword L(\type\()_8tap_v_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection L(\type\()_8tap_hv): cmp \h, #4 @@ -2048,16 +2066,16 @@ L(\type\()_8tap_hv): 4: add \xmy, x11, \my, uxtw #3 - adr x10, L(\type\()_8tap_hv_tbl) + adrp x10, L(\type\()_8tap_hv_tbl) + add x10, x10, :lo12: L(\type\()_8tap_hv_tbl) dup v30.4s, w12 // 6 - intermediate_bits - ldrh w9, [x10, x9, lsl #1] + ldr x10, [x10, x9, lsl #3] neg v30.4s, v30.4s // -(6-intermediate_bits) .ifc \type, put dup v29.4s, w13 // 6 + intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - sub x10, x10, w9, uxtw .ifc \type, put neg v29.4s, v29.4s // -(6+intermediate_bits) .endif @@ -2623,15 +2641,17 @@ L(\type\()_8tap_filter_8): uzp1 v24.8h, v27.8h, v28.8h // Ditto ret + .pushsection .data.rel.ro, "aw" L(\type\()_8tap_hv_tbl): - .hword L(\type\()_8tap_hv_tbl) - 1280b - .hword L(\type\()_8tap_hv_tbl) - 640b - .hword L(\type\()_8tap_hv_tbl) - 320b - .hword L(\type\()_8tap_hv_tbl) - 160b - .hword L(\type\()_8tap_hv_tbl) - 80b - .hword L(\type\()_8tap_hv_tbl) - 40b - .hword L(\type\()_8tap_hv_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection endfunc @@ -2665,16 +2685,16 @@ function \type\()_bilin_16bpc_neon, export=1 L(\type\()_bilin_h): cbnz \my, L(\type\()_bilin_hv) - adr x10, L(\type\()_bilin_h_tbl) + adrp x10, L(\type\()_bilin_h_tbl) + add x10, x10, :lo12: L(\type\()_bilin_h_tbl) dup v31.8h, w11 // 4 - intermediate_bits - ldrh w9, [x10, x9, lsl #1] + ldr x10, [x10, x9, lsl #3] neg v31.8h, v31.8h // -(4-intermediate_bits) .ifc \type, put dup v30.8h, \bdmax // intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - sub x10, x10, w9, uxtw .ifc \type, put neg v30.8h, v30.8h // -intermediate_bits .endif @@ -2832,29 +2852,31 @@ L(\type\()_bilin_h): b.gt 161b ret + .pushsection .data.rel.ro, "aw" L(\type\()_bilin_h_tbl): - .hword L(\type\()_bilin_h_tbl) - 1280b - .hword L(\type\()_bilin_h_tbl) - 640b - .hword L(\type\()_bilin_h_tbl) - 320b - .hword L(\type\()_bilin_h_tbl) - 160b - .hword L(\type\()_bilin_h_tbl) - 80b - .hword L(\type\()_bilin_h_tbl) - 40b - .hword L(\type\()_bilin_h_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection L(\type\()_bilin_v): cmp \h, #4 - adr x10, L(\type\()_bilin_v_tbl) + adrp x10, L(\type\()_bilin_v_tbl) + add x10, x10, :lo12: L(\type\()_bilin_v_tbl) .ifc \type, prep dup v31.8h, w11 // 4 - intermediate_bits .endif - ldrh w9, [x10, x9, lsl #1] + ldr x10, [x10, x9, lsl #3] .ifc \type, prep movi v29.8h, #(PREP_BIAS >> 8), lsl #8 neg v31.8h, v31.8h // -(4-intermediate_bits) .endif - sub x10, x10, w9, uxtw br x10 20: // 2xN v @@ -3030,27 +3052,29 @@ L(\type\()_bilin_v): 0: ret + .pushsection .data.rel.ro, "aw" L(\type\()_bilin_v_tbl): - .hword L(\type\()_bilin_v_tbl) - 1280b - .hword L(\type\()_bilin_v_tbl) - 640b - .hword L(\type\()_bilin_v_tbl) - 320b - .hword L(\type\()_bilin_v_tbl) - 160b - .hword L(\type\()_bilin_v_tbl) - 80b - .hword L(\type\()_bilin_v_tbl) - 40b - .hword L(\type\()_bilin_v_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection L(\type\()_bilin_hv): - adr x10, L(\type\()_bilin_hv_tbl) + adrp x10, L(\type\()_bilin_hv_tbl) + add x10, x10, :lo12: L(\type\()_bilin_hv_tbl) dup v31.8h, w11 // 4 - intermediate_bits - ldrh w9, [x10, x9, lsl #1] + ldr x10, [x10, x9, lsl #3] neg v31.8h, v31.8h // -(4-intermediate_bits) .ifc \type, put dup v30.4s, w12 // 4 + intermediate_bits .else movi v29.8h, #(PREP_BIAS >> 8), lsl #8 .endif - sub x10, x10, w9, uxtw .ifc \type, put neg v30.4s, v30.4s // -(4+intermediate_bits) .endif @@ -3224,15 +3248,17 @@ L(\type\()_bilin_hv): 0: ret + .pushsection .data.rel.ro, "aw" L(\type\()_bilin_hv_tbl): - .hword L(\type\()_bilin_hv_tbl) - 1280b - .hword L(\type\()_bilin_hv_tbl) - 640b - .hword L(\type\()_bilin_hv_tbl) - 320b - .hword L(\type\()_bilin_hv_tbl) - 160b - .hword L(\type\()_bilin_hv_tbl) - 80b - .hword L(\type\()_bilin_hv_tbl) - 40b - .hword L(\type\()_bilin_hv_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection endfunc .endm