Index: src/arm/64/mc.S --- src/arm/64/mc.S.orig +++ src/arm/64/mc.S @@ -79,11 +79,11 @@ function \type\()_8bpc_neon, export=1 .ifc \type, mask movi v31.16b, #256-2 .endif - adr x7, L(\type\()_tbl) + adrp x7, L(\type\()_tbl) + add x7, x7, :lo12: L(\type\()_tbl) sub w4, w4, #24 - ldrh w4, [x7, x4, lsl #1] + ldr x7, [x7, x4, lsl #3] \type v4, v0, v1, v2, v3 - sub x7, x7, w4, uxtw br x7 40: AARCH64_VALID_JUMP_TARGET @@ -192,13 +192,15 @@ function \type\()_8bpc_neon, export=1 b 128b 0: ret + .pushsection .data.rel.ro, "aw" L(\type\()_tbl): - .hword L(\type\()_tbl) - 1280b - .hword L(\type\()_tbl) - 640b - .hword L(\type\()_tbl) - 320b - .hword L(\type\()_tbl) - 16b - .hword L(\type\()_tbl) - 80b - .hword L(\type\()_tbl) - 40b + .xword 1280b + .xword 640b + .xword 320b + .xword 16b + .xword 80b + .xword 40b + .popsection endfunc .endm @@ -210,10 +212,10 @@ bidir_fn mask .macro w_mask_fn type function w_mask_\type\()_8bpc_neon, export=1 clz w8, w4 - adr x9, L(w_mask_\type\()_tbl) + adrp x9, L(w_mask_\type\()_tbl) + add x9, x9, :lo12: L(w_mask_\type\()_tbl) sub w8, w8, #24 - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + ldr x9, [x9, x8, lsl #3] mov w10, #6903 dup v0.8h, w10 .if \type == 444 @@ -413,13 +415,15 @@ function w_mask_\type\()_8bpc_neon, export=1 add x12, x12, x1 b.gt 161b ret + .pushsection .data.rel.ro, "aw" L(w_mask_\type\()_tbl): - .hword L(w_mask_\type\()_tbl) - 1280b - .hword L(w_mask_\type\()_tbl) - 640b - .hword L(w_mask_\type\()_tbl) - 320b - .hword L(w_mask_\type\()_tbl) - 160b - .hword L(w_mask_\type\()_tbl) - 8b - .hword L(w_mask_\type\()_tbl) - 4b + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 8b + .xword 4b + .popsection endfunc .endm @@ -429,11 +433,11 @@ w_mask_fn 420 function blend_8bpc_neon, export=1 - adr x6, L(blend_tbl) + adrp x6, L(blend_tbl) + add x6, x6, :lo12: L(blend_tbl) clz w3, w3 sub w3, w3, #26 - ldrh w3, [x6, x3, lsl #1] - sub x6, x6, w3, uxtw + ldr x6, [x6, x3, lsl #3] movi v4.16b, #64 add x8, x0, x1 lsl x1, x1, #1 @@ -535,15 +539,18 @@ function blend_8bpc_neon, export=1 st1 {v27.16b, v28.16b}, [x8], x1 b.gt 32b ret + .pushsection .data.rel.ro, "aw" L(blend_tbl): - .hword L(blend_tbl) - 32b - .hword L(blend_tbl) - 16b - .hword L(blend_tbl) - 8b - .hword L(blend_tbl) - 4b + .xword 32b + .xword 16b + .xword 8b + .xword 4b + .popsection endfunc function blend_h_8bpc_neon, export=1 - adr x6, L(blend_h_tbl) + adrp x6, L(blend_h_tbl) + add x6, x6, :lo12: L(blend_h_tbl) movrel x5, X(obmc_masks) add x5, x5, w4, uxtw sub w4, w4, w4, lsr #2 @@ -552,8 +559,7 @@ function blend_h_8bpc_neon, export=1 add x8, x0, x1 lsl x1, x1, #1 sub w7, w7, #24 - ldrh w7, [x6, x7, lsl #1] - sub x6, x6, w7, uxtw + ldr x6, [x6, x7, lsl #3] br x6 2: AARCH64_VALID_JUMP_TARGET @@ -682,18 +688,21 @@ function blend_h_8bpc_neon, export=1 add x7, x7, w3, uxtw b.gt 321b ret + .pushsection .data.rel.ro, "aw" L(blend_h_tbl): - .hword L(blend_h_tbl) - 1280b - .hword L(blend_h_tbl) - 640b - .hword L(blend_h_tbl) - 320b - .hword L(blend_h_tbl) - 16b - .hword L(blend_h_tbl) - 8b - .hword L(blend_h_tbl) - 4b - .hword L(blend_h_tbl) - 2b + .xword 1280b + .xword 640b + .xword 320b + .xword 16b + .xword 8b + .xword 4b + .xword 2b + .popsection endfunc function blend_v_8bpc_neon, export=1 - adr x6, L(blend_v_tbl) + adrp x6, L(blend_v_tbl) + add x6, x6, :lo12: L(blend_v_tbl) movrel x5, X(obmc_masks) add x5, x5, w3, uxtw clz w3, w3 @@ -701,8 +710,7 @@ function blend_v_8bpc_neon, export=1 add x8, x0, x1 lsl x1, x1, #1 sub w3, w3, #26 - ldrh w3, [x6, x3, lsl #1] - sub x6, x6, w3, uxtw + ldr x6, [x6, x3, lsl #3] br x6 20: AARCH64_VALID_JUMP_TARGET @@ -826,21 +834,23 @@ function blend_v_8bpc_neon, export=1 st1 {v27.8b}, [x8], x1 b.gt 32b ret + .pushsection .data.rel.ro, "aw" L(blend_v_tbl): - .hword L(blend_v_tbl) - 320b - .hword L(blend_v_tbl) - 160b - .hword L(blend_v_tbl) - 80b - .hword L(blend_v_tbl) - 40b - .hword L(blend_v_tbl) - 20b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .popsection endfunc // This has got the same signature as the put_8tap functions, // and assumes that x8 is set to (clz(w)-24). function put_neon - adr x9, L(put_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + adrp x9, L(put_tbl) + add x9, x9, :lo12: L(put_tbl) + ldr x9, [x9, x8, lsl #3] br x9 2: @@ -926,23 +936,25 @@ function put_neon b.gt 128b ret + .pushsection .data.rel.ro, "aw" L(put_tbl): - .hword L(put_tbl) - 128b - .hword L(put_tbl) - 64b - .hword L(put_tbl) - 32b - .hword L(put_tbl) - 160b - .hword L(put_tbl) - 8b - .hword L(put_tbl) - 4b - .hword L(put_tbl) - 2b + .xword 128b + .xword 64b + .xword 32b + .xword 160b + .xword 8b + .xword 4b + .xword 2b + .popsection endfunc // This has got the same signature as the prep_8tap functions, // and assumes that x8 is set to (clz(w)-24), and x7 to w*2. function prep_neon - adr x9, L(prep_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + adrp x9, L(prep_tbl) + add x9, x9, :lo12: L(prep_tbl) + ldr x9, [x9, x8, lsl #3] br x9 4: @@ -1058,13 +1070,15 @@ function prep_neon b.gt 128b ret + .pushsection .data.rel.ro, "aw" L(prep_tbl): - .hword L(prep_tbl) - 1280b - .hword L(prep_tbl) - 640b - .hword L(prep_tbl) - 320b - .hword L(prep_tbl) - 160b - .hword L(prep_tbl) - 8b - .hword L(prep_tbl) - 4b + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 8b + .xword 4b + .popsection endfunc @@ -1370,9 +1384,9 @@ L(\type\()_8tap_h): add \xmx, x10, \mx, uxtw #3 b.ne L(\type\()_8tap_hv) - adr x9, L(\type\()_8tap_h_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + adrp x9, L(\type\()_8tap_h_tbl) + add x9, x9, :lo12: L(\type\()_8tap_h_tbl) + ldr x9, [x9, x8, lsl #3] br x9 20: // 2xN h @@ -1575,15 +1589,17 @@ L(\type\()_8tap_h): b.gt 161b ret + .pushsection .data.rel.ro, "aw" L(\type\()_8tap_h_tbl): - .hword L(\type\()_8tap_h_tbl) - 1280b - .hword L(\type\()_8tap_h_tbl) - 640b - .hword L(\type\()_8tap_h_tbl) - 320b - .hword L(\type\()_8tap_h_tbl) - 160b - .hword L(\type\()_8tap_h_tbl) - 80b - .hword L(\type\()_8tap_h_tbl) - 40b - .hword L(\type\()_8tap_h_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection L(\type\()_8tap_v): @@ -1595,9 +1611,9 @@ L(\type\()_8tap_v): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_8tap_v_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + adrp x9, L(\type\()_8tap_v_tbl) + add x9, x9, :lo12: L(\type\()_8tap_v_tbl) + ldr x9, [x9, x8, lsl #3] br x9 20: // 2xN v @@ -1901,15 +1917,17 @@ L(\type\()_8tap_v): 0: ret + .pushsection .data.rel.ro, "aw" L(\type\()_8tap_v_tbl): - .hword L(\type\()_8tap_v_tbl) - 1280b - .hword L(\type\()_8tap_v_tbl) - 640b - .hword L(\type\()_8tap_v_tbl) - 320b - .hword L(\type\()_8tap_v_tbl) - 160b - .hword L(\type\()_8tap_v_tbl) - 80b - .hword L(\type\()_8tap_v_tbl) - 40b - .hword L(\type\()_8tap_v_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection L(\type\()_8tap_hv): cmp \h, #4 @@ -1920,9 +1938,9 @@ L(\type\()_8tap_hv): 4: add \xmy, x10, \my, uxtw #3 - adr x9, L(\type\()_8tap_hv_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + adrp x9, L(\type\()_8tap_hv_tbl) + add x9, x9, :lo12: L(\type\()_8tap_hv_tbl) + ldr x9, [x9, x8, lsl #3] br x9 20: @@ -2444,15 +2462,17 @@ L(\type\()_8tap_filter_8): srshr v25.8h, v25.8h, #2 ret + .pushsection .data.rel.ro, "aw" L(\type\()_8tap_hv_tbl): - .hword L(\type\()_8tap_hv_tbl) - 1280b - .hword L(\type\()_8tap_hv_tbl) - 640b - .hword L(\type\()_8tap_hv_tbl) - 320b - .hword L(\type\()_8tap_hv_tbl) - 160b - .hword L(\type\()_8tap_hv_tbl) - 80b - .hword L(\type\()_8tap_hv_tbl) - 40b - .hword L(\type\()_8tap_hv_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection endfunc @@ -2478,9 +2498,9 @@ function \type\()_bilin_8bpc_neon, export=1 L(\type\()_bilin_h): cbnz \my, L(\type\()_bilin_hv) - adr x9, L(\type\()_bilin_h_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + adrp x9, L(\type\()_bilin_h_tbl) + add x9, x9, :lo12: L(\type\()_bilin_h_tbl) + ldr x9, [x9, x8, lsl #3] br x9 20: // 2xN h @@ -2624,22 +2644,24 @@ L(\type\()_bilin_h): b.gt 161b ret + .pushsection .data.rel.ro, "aw" L(\type\()_bilin_h_tbl): - .hword L(\type\()_bilin_h_tbl) - 1280b - .hword L(\type\()_bilin_h_tbl) - 640b - .hword L(\type\()_bilin_h_tbl) - 320b - .hword L(\type\()_bilin_h_tbl) - 160b - .hword L(\type\()_bilin_h_tbl) - 80b - .hword L(\type\()_bilin_h_tbl) - 40b - .hword L(\type\()_bilin_h_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection L(\type\()_bilin_v): cmp \h, #4 - adr x9, L(\type\()_bilin_v_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + adrp x9, L(\type\()_bilin_v_tbl) + add x9, x9, :lo12: L(\type\()_bilin_v_tbl) + ldr x9, [x9, x8, lsl #3] br x9 20: // 2xN v @@ -2810,22 +2832,24 @@ L(\type\()_bilin_v): 0: ret + .pushsection .data.rel.ro, "aw" L(\type\()_bilin_v_tbl): - .hword L(\type\()_bilin_v_tbl) - 1280b - .hword L(\type\()_bilin_v_tbl) - 640b - .hword L(\type\()_bilin_v_tbl) - 320b - .hword L(\type\()_bilin_v_tbl) - 160b - .hword L(\type\()_bilin_v_tbl) - 80b - .hword L(\type\()_bilin_v_tbl) - 40b - .hword L(\type\()_bilin_v_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection L(\type\()_bilin_hv): uxtl v2.8h, v2.8b uxtl v3.8h, v3.8b - adr x9, L(\type\()_bilin_hv_tbl) - ldrh w8, [x9, x8, lsl #1] - sub x9, x9, w8, uxtw + adrp x9, L(\type\()_bilin_hv_tbl) + add x9, x9, :lo12: L(\type\()_bilin_hv_tbl) + ldr x9, [x9, x8, lsl #3] br x9 20: // 2xN hv @@ -2975,15 +2999,17 @@ L(\type\()_bilin_hv): 0: ret + .pushsection .data.rel.ro, "aw" L(\type\()_bilin_hv_tbl): - .hword L(\type\()_bilin_hv_tbl) - 1280b - .hword L(\type\()_bilin_hv_tbl) - 640b - .hword L(\type\()_bilin_hv_tbl) - 320b - .hword L(\type\()_bilin_hv_tbl) - 160b - .hword L(\type\()_bilin_hv_tbl) - 80b - .hword L(\type\()_bilin_hv_tbl) - 40b - .hword L(\type\()_bilin_hv_tbl) - 20b - .hword 0 + .xword 1280b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .xword 0 + .popsection endfunc .endm