Index: src/arm/64/ipred.S --- src/arm/64/ipred.S.orig +++ src/arm/64/ipred.S @@ -34,11 +34,11 @@ // const int max_width, const int max_height); function ipred_dc_128_8bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_dc_128_tbl) + adrp x5, L(ipred_dc_128_tbl) + add x5, x5, :lo12: L(ipred_dc_128_tbl) sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldr x5, [x5, w3, uxtw #3] movi v0.16b, #128 - sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -94,12 +94,14 @@ function ipred_dc_128_8bpc_neon, export=1 b.gt 64b ret + .pushsection .data.rel.ro, "aw" L(ipred_dc_128_tbl): - .hword L(ipred_dc_128_tbl) - 640b - .hword L(ipred_dc_128_tbl) - 320b - .hword L(ipred_dc_128_tbl) - 16b - .hword L(ipred_dc_128_tbl) - 8b - .hword L(ipred_dc_128_tbl) - 4b + .xword 640b + .xword 320b + .xword 16b + .xword 8b + .xword 4b + .popsection endfunc // void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -108,11 +110,11 @@ endfunc // const int max_width, const int max_height); function ipred_v_8bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_v_tbl) + adrp x5, L(ipred_v_tbl) + add x5, x5, :lo12: L(ipred_v_tbl) sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldr x5, [x5, w3, uxtw #3] add x2, x2, #1 - sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -172,12 +174,14 @@ function ipred_v_8bpc_neon, export=1 b.gt 64b ret + .pushsection .data.rel.ro, "aw" L(ipred_v_tbl): - .hword L(ipred_v_tbl) - 640b - .hword L(ipred_v_tbl) - 320b - .hword L(ipred_v_tbl) - 160b - .hword L(ipred_v_tbl) - 80b - .hword L(ipred_v_tbl) - 40b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc // void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -186,11 +190,11 @@ endfunc // const int max_width, const int max_height); function ipred_h_8bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_h_tbl) + adrp x5, L(ipred_h_tbl) + add x5, x5, :lo12: L(ipred_h_tbl) sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldr x5, [x5, w3, uxtw #3] sub x2, x2, #4 - sub x5, x5, w3, uxtw mov x7, #-4 add x6, x0, x1 lsl x1, x1, #1 @@ -258,12 +262,14 @@ function ipred_h_8bpc_neon, export=1 b.gt 64b ret + .pushsection .data.rel.ro, "aw" L(ipred_h_tbl): - .hword L(ipred_h_tbl) - 64b - .hword L(ipred_h_tbl) - 32b - .hword L(ipred_h_tbl) - 16b - .hword L(ipred_h_tbl) - 8b - .hword L(ipred_h_tbl) - 4b + .xword 64b + .xword 32b + .xword 16b + .xword 8b + .xword 4b + .popsection endfunc // void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -272,11 +278,11 @@ endfunc // const int max_width, const int max_height); function ipred_dc_top_8bpc_neon, export=1 clz w3, w3 - adr x5, L(ipred_dc_top_tbl) + adrp x5, L(ipred_dc_top_tbl) + add x5, x5, :lo12: L(ipred_dc_top_tbl) sub w3, w3, #25 - ldrh w3, [x5, w3, uxtw #1] + ldr x5, [x5, w3, uxtw #3] add x2, x2, #1 - sub x5, x5, w3, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -363,12 +369,14 @@ function ipred_dc_top_8bpc_neon, export=1 b.gt 64b ret + .pushsection .data.rel.ro, "aw" L(ipred_dc_top_tbl): - .hword L(ipred_dc_top_tbl) - 640b - .hword L(ipred_dc_top_tbl) - 320b - .hword L(ipred_dc_top_tbl) - 160b - .hword L(ipred_dc_top_tbl) - 80b - .hword L(ipred_dc_top_tbl) - 40b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc // void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -379,13 +387,12 @@ function ipred_dc_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w3, w3 clz w7, w4 - adr x5, L(ipred_dc_left_tbl) + adrp x5, L(ipred_dc_left_tbl) + add x5, x5, :lo12: L(ipred_dc_left_tbl) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w7, w7, #25 - ldrh w3, [x5, w3, uxtw #1] - ldrh w7, [x5, w7, uxtw #1] - sub x3, x5, w3, uxtw - sub x5, x5, w7, uxtw + ldr x3, [x5, w3, uxtw #3] + ldr x5, [x5, w7, uxtw #3] add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -489,17 +496,19 @@ L(ipred_dc_left_w64): b.gt 1b ret + .pushsection .data.rel.ro, "aw" L(ipred_dc_left_tbl): - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8) - .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4) + .xword L(ipred_dc_left_h64) + .xword L(ipred_dc_left_h32) + .xword L(ipred_dc_left_h16) + .xword L(ipred_dc_left_h8) + .xword L(ipred_dc_left_h4) + .xword L(ipred_dc_left_w64) + .xword L(ipred_dc_left_w32) + .xword L(ipred_dc_left_w16) + .xword L(ipred_dc_left_w8) + .xword L(ipred_dc_left_w4) + .popsection endfunc // void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -512,16 +521,15 @@ function ipred_dc_8bpc_neon, export=1 clz w3, w3 clz w6, w4 dup v16.8h, w7 // width + height - adr x5, L(ipred_dc_tbl) + adrp x5, L(ipred_dc_tbl) + add x5, x5, :lo12: L(ipred_dc_tbl) rbit w7, w7 // rbit(width + height) sub w3, w3, #20 // 25 leading bits, minus table offset 5 sub w6, w6, #25 clz w7, w7 // ctz(width + height) - ldrh w3, [x5, w3, uxtw #1] - ldrh w6, [x5, w6, uxtw #1] + ldr x3, [x5, w3, uxtw #3] + ldr x5, [x5, w6, uxtw #3] neg w7, w7 // -ctz(width + height) - sub x3, x5, w3, uxtw - sub x5, x5, w6, uxtw ushr v16.8h, v16.8h, #1 // (width + height) >> 1 dup v17.8h, w7 // -ctz(width + height) add x6, x0, x1 @@ -714,17 +722,19 @@ L(ipred_dc_w64): b.gt 2b ret + .pushsection .data.rel.ro, "aw" L(ipred_dc_tbl): - .hword L(ipred_dc_tbl) - L(ipred_dc_h64) - .hword L(ipred_dc_tbl) - L(ipred_dc_h32) - .hword L(ipred_dc_tbl) - L(ipred_dc_h16) - .hword L(ipred_dc_tbl) - L(ipred_dc_h8) - .hword L(ipred_dc_tbl) - L(ipred_dc_h4) - .hword L(ipred_dc_tbl) - L(ipred_dc_w64) - .hword L(ipred_dc_tbl) - L(ipred_dc_w32) - .hword L(ipred_dc_tbl) - L(ipred_dc_w16) - .hword L(ipred_dc_tbl) - L(ipred_dc_w8) - .hword L(ipred_dc_tbl) - L(ipred_dc_w4) + .xword L(ipred_dc_h64) + .xword L(ipred_dc_h32) + .xword L(ipred_dc_h16) + .xword L(ipred_dc_h8) + .xword L(ipred_dc_h4) + .xword L(ipred_dc_w64) + .xword L(ipred_dc_w32) + .xword L(ipred_dc_w16) + .xword L(ipred_dc_w8) + .xword L(ipred_dc_w4) + .popsection endfunc // void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -733,13 +743,13 @@ endfunc // const int max_width, const int max_height); function ipred_paeth_8bpc_neon, export=1 clz w9, w3 - adr x5, L(ipred_paeth_tbl) + adrp x5, L(ipred_paeth_tbl) + add x5, x5, :lo12: L(ipred_paeth_tbl) sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldr x5, [x5, w9, uxtw #3] ld1r {v4.16b}, [x2] add x8, x2, #1 sub x2, x2, #4 - sub x5, x5, w9, uxtw mov x7, #-4 add x6, x0, x1 lsl x1, x1, #1 @@ -899,12 +909,14 @@ function ipred_paeth_8bpc_neon, export=1 9: ret + .pushsection .data.rel.ro, "aw" L(ipred_paeth_tbl): - .hword L(ipred_paeth_tbl) - 640b - .hword L(ipred_paeth_tbl) - 320b - .hword L(ipred_paeth_tbl) - 160b - .hword L(ipred_paeth_tbl) - 80b - .hword L(ipred_paeth_tbl) - 40b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -916,13 +928,13 @@ function ipred_smooth_8bpc_neon, export=1 add x11, x10, w4, uxtw add x10, x10, w3, uxtw clz w9, w3 - adr x5, L(ipred_smooth_tbl) + adrp x5, L(ipred_smooth_tbl) + add x5, x5, :lo12: L(ipred_smooth_tbl) sub x12, x2, w4, uxtw sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldr x5, [x5, w9, uxtw #3] ld1r {v4.16b}, [x12] // bottom add x8, x2, #1 - sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -1080,12 +1092,14 @@ function ipred_smooth_8bpc_neon, export=1 9: ret + .pushsection .data.rel.ro, "aw" L(ipred_smooth_tbl): - .hword L(ipred_smooth_tbl) - 640b - .hword L(ipred_smooth_tbl) - 320b - .hword L(ipred_smooth_tbl) - 160b - .hword L(ipred_smooth_tbl) - 80b - .hword L(ipred_smooth_tbl) - 40b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -1096,13 +1110,13 @@ function ipred_smooth_v_8bpc_neon, export=1 movrel x7, X(sm_weights) add x7, x7, w4, uxtw clz w9, w3 - adr x5, L(ipred_smooth_v_tbl) + adrp x5, L(ipred_smooth_v_tbl) + add x5, x5, :lo12: L(ipred_smooth_v_tbl) sub x8, x2, w4, uxtw sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldr x5, [x5, w9, uxtw #3] ld1r {v4.16b}, [x8] // bottom add x2, x2, #1 - sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -1221,12 +1235,14 @@ function ipred_smooth_v_8bpc_neon, export=1 9: ret + .pushsection .data.rel.ro, "aw" L(ipred_smooth_v_tbl): - .hword L(ipred_smooth_v_tbl) - 640b - .hword L(ipred_smooth_v_tbl) - 320b - .hword L(ipred_smooth_v_tbl) - 160b - .hword L(ipred_smooth_v_tbl) - 80b - .hword L(ipred_smooth_v_tbl) - 40b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -1237,12 +1253,12 @@ function ipred_smooth_h_8bpc_neon, export=1 movrel x8, X(sm_weights) add x8, x8, w3, uxtw clz w9, w3 - adr x5, L(ipred_smooth_h_tbl) + adrp x5, L(ipred_smooth_h_tbl) + add x5, x5, :lo12: L(ipred_smooth_h_tbl) add x12, x2, w3, uxtw sub w9, w9, #25 - ldrh w9, [x5, w9, uxtw #1] + ldr x5, [x5, w9, uxtw #3] ld1r {v5.16b}, [x12] // right - sub x5, x5, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x5 @@ -1367,12 +1383,14 @@ function ipred_smooth_h_8bpc_neon, export=1 9: ret + .pushsection .data.rel.ro, "aw" L(ipred_smooth_h_tbl): - .hword L(ipred_smooth_h_tbl) - 640b - .hword L(ipred_smooth_h_tbl) - 320b - .hword L(ipred_smooth_h_tbl) - 160b - .hword L(ipred_smooth_h_tbl) - 80b - .hword L(ipred_smooth_h_tbl) - 40b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc const padding_mask_buf @@ -1653,11 +1671,11 @@ endfunc // const int dx, const int max_base_x); function ipred_z1_fill1_8bpc_neon, export=1 clz w9, w3 - adr x8, L(ipred_z1_fill1_tbl) + adrp x8, L(ipred_z1_fill1_tbl) + add x8, x8, :lo12: L(ipred_z1_fill1_tbl) sub w9, w9, #25 - ldrh w9, [x8, w9, uxtw #1] + ldr x8, [x8, w9, uxtw #3] add x10, x2, w6, uxtw // top[max_base_x] - sub x8, x8, w9, uxtw ld1r {v31.16b}, [x10] // padding mov w7, w5 mov w15, #64 @@ -1816,12 +1834,14 @@ function ipred_z1_fill1_8bpc_neon, export=1 mov w3, w12 b 169b + .pushsection .data.rel.ro, "aw" L(ipred_z1_fill1_tbl): - .hword L(ipred_z1_fill1_tbl) - 640b - .hword L(ipred_z1_fill1_tbl) - 320b - .hword L(ipred_z1_fill1_tbl) - 160b - .hword L(ipred_z1_fill1_tbl) - 80b - .hword L(ipred_z1_fill1_tbl) - 40b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc function ipred_z1_fill2_8bpc_neon, export=1 @@ -1940,11 +1960,11 @@ endconst // const int dx, const int dy); function ipred_z2_fill1_8bpc_neon, export=1 clz w10, w4 - adr x9, L(ipred_z2_fill1_tbl) + adrp x9, L(ipred_z2_fill1_tbl) + add x9, x9, :lo12: L(ipred_z2_fill1_tbl) sub w10, w10, #25 - ldrh w10, [x9, w10, uxtw #1] + ldr x9, [x9, w10, uxtw #3] mov w8, #(1 << 6) // xpos = 1 << 6 - sub x9, x9, w10, uxtw sub w8, w8, w6 // xpos -= dx movrel x11, increments @@ -2651,12 +2671,14 @@ function ipred_z2_fill1_8bpc_neon, export=1 ldp d8, d9, [sp], 0x40 ret + .pushsection .data.rel.ro, "aw" L(ipred_z2_fill1_tbl): - .hword L(ipred_z2_fill1_tbl) - 640b - .hword L(ipred_z2_fill1_tbl) - 320b - .hword L(ipred_z2_fill1_tbl) - 160b - .hword L(ipred_z2_fill1_tbl) - 80b - .hword L(ipred_z2_fill1_tbl) - 40b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc function ipred_z2_fill2_8bpc_neon, export=1 @@ -3160,11 +3182,11 @@ endfunc function ipred_z3_fill1_8bpc_neon, export=1 cmp w6, #64 clz w9, w3 - adr x8, L(ipred_z3_fill1_tbl) + adrp x8, L(ipred_z3_fill1_tbl) + add x8, x8, :lo12: L(ipred_z3_fill1_tbl) sub w9, w9, #25 - ldrh w9, [x8, w9, uxtw #1] + ldr x8, [x8, w9, uxtw #3] add x10, x2, w6, uxtw // left[max_base_y] - sub x8, x8, w9, uxtw movrel x11, increments ld1r {v31.16b}, [x10] // padding ld1 {v30.8h}, [x11] // increments @@ -3503,17 +3525,20 @@ L(ipred_z3_fill1_large_h16): 9: ret + .pushsection .data.rel.ro, "aw" L(ipred_z3_fill1_tbl): - .hword L(ipred_z3_fill1_tbl) - 640b - .hword L(ipred_z3_fill1_tbl) - 320b - .hword L(ipred_z3_fill1_tbl) - 160b - .hword L(ipred_z3_fill1_tbl) - 80b - .hword L(ipred_z3_fill1_tbl) - 40b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc function ipred_z3_fill_padding_neon, export=0 cmp w3, #16 - adr x8, L(ipred_z3_fill_padding_tbl) + adrp x8, L(ipred_z3_fill_padding_tbl) + add x8, x8, :lo12: L(ipred_z3_fill_padding_tbl) b.gt L(ipred_z3_fill_padding_wide) // w3 = remaining width, w4 = constant height mov w12, w4 @@ -3524,10 +3549,11 @@ function ipred_z3_fill_padding_neon, export=0 // power of two in the remaining width, and repeating. clz w9, w3 sub w9, w9, #25 - ldrh w9, [x8, w9, uxtw #1] - sub x9, x8, w9, uxtw + ldr x9, [x8, w9, uxtw #3] br x9 +20: + AARCH64_VALID_JUMP_TARGET 2: st1 {v31.h}[0], [x0], x1 subs w4, w4, #4 @@ -3546,6 +3572,8 @@ function ipred_z3_fill_padding_neon, export=0 mov w4, w12 b 1b +40: + AARCH64_VALID_JUMP_TARGET 4: st1 {v31.s}[0], [x0], x1 subs w4, w4, #4 @@ -3564,7 +3592,8 @@ function ipred_z3_fill_padding_neon, export=0 mov w4, w12 b 1b -8: +80: + AARCH64_VALID_JUMP_TARGET st1 {v31.8b}, [x0], x1 subs w4, w4, #4 st1 {v31.8b}, [x13], x1 @@ -3582,9 +3611,10 @@ function ipred_z3_fill_padding_neon, export=0 mov w4, w12 b 1b -16: -32: -64: +160: +320: +640: + AARCH64_VALID_JUMP_TARGET st1 {v31.16b}, [x0], x1 subs w4, w4, #4 st1 {v31.16b}, [x13], x1 @@ -3605,13 +3635,15 @@ function ipred_z3_fill_padding_neon, export=0 9: ret + .pushsection .data.rel.ro, "aw" L(ipred_z3_fill_padding_tbl): - .hword L(ipred_z3_fill_padding_tbl) - 64b - .hword L(ipred_z3_fill_padding_tbl) - 32b - .hword L(ipred_z3_fill_padding_tbl) - 16b - .hword L(ipred_z3_fill_padding_tbl) - 8b - .hword L(ipred_z3_fill_padding_tbl) - 4b - .hword L(ipred_z3_fill_padding_tbl) - 2b + .xword 640b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .xword 20b + .popsection L(ipred_z3_fill_padding_wide): // Fill a WxH rectangle with padding, with W > 16. @@ -3766,13 +3798,13 @@ function ipred_filter_8bpc_neon, export=1 add x6, x6, w5, uxtw ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32 clz w9, w3 - adr x5, L(ipred_filter_tbl) + adrp x5, L(ipred_filter_tbl) + add x5, x5, :lo12: L(ipred_filter_tbl) ld1 {v20.8b, v21.8b, v22.8b}, [x6] sub w9, w9, #26 - ldrh w9, [x5, w9, uxtw #1] + ldr x5, [x5, w9, uxtw #3] sxtl v16.8h, v16.8b sxtl v17.8h, v17.8b - sub x5, x5, w9, uxtw sxtl v18.8h, v18.8b sxtl v19.8h, v19.8b add x6, x0, x1 @@ -3913,11 +3945,13 @@ function ipred_filter_8bpc_neon, export=1 9: ret + .pushsection .data.rel.ro, "aw" L(ipred_filter_tbl): - .hword L(ipred_filter_tbl) - 320b - .hword L(ipred_filter_tbl) - 160b - .hword L(ipred_filter_tbl) - 80b - .hword L(ipred_filter_tbl) - 40b + .xword 320b + .xword 160b + .xword 80b + .xword 40b + .popsection endfunc // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -3926,11 +3960,11 @@ endfunc function pal_pred_8bpc_neon, export=1 ld1 {v0.8h}, [x2] clz w9, w4 - adr x6, L(pal_pred_tbl) + adrp x6, L(pal_pred_tbl) + add x6, x6, :lo12: L(pal_pred_tbl) sub w9, w9, #25 - ldrh w9, [x6, w9, uxtw #1] + ldr x6, [x6, w9, uxtw #3] xtn v0.8b, v0.8h - sub x6, x6, w9, uxtw add x2, x0, x1 lsl x1, x1, #1 br x6 @@ -4008,12 +4042,14 @@ function pal_pred_8bpc_neon, export=1 b.gt 64b ret + .pushsection .data.rel.ro, "aw" L(pal_pred_tbl): - .hword L(pal_pred_tbl) - 64b - .hword L(pal_pred_tbl) - 32b - .hword L(pal_pred_tbl) - 16b - .hword L(pal_pred_tbl) - 8b - .hword L(pal_pred_tbl) - 4b + .xword 64b + .xword 32b + .xword 16b + .xword 8b + .xword 4b + .popsection endfunc // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -4022,12 +4058,12 @@ endfunc // const int16_t *ac, const int alpha); function ipred_cfl_128_8bpc_neon, export=1 clz w9, w3 - adr x7, L(ipred_cfl_128_tbl) + adrp x7, L(ipred_cfl_128_tbl) + add x7, x7, :lo12: L(ipred_cfl_128_tbl) sub w9, w9, #26 - ldrh w9, [x7, w9, uxtw #1] + ldr x7, [x7, w9, uxtw #3] movi v0.8h, #128 // dc dup v1.8h, w6 // alpha - sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x7 @@ -4132,12 +4168,14 @@ L(ipred_cfl_splat_w16): b.gt 1b ret + .pushsection .data.rel.ro, "aw" L(ipred_cfl_128_tbl): L(ipred_cfl_splat_tbl): - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16) - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8) - .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4) + .xword L(ipred_cfl_splat_w16) + .xword L(ipred_cfl_splat_w16) + .xword L(ipred_cfl_splat_w8) + .xword L(ipred_cfl_splat_w4) + .popsection endfunc // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -4146,12 +4184,12 @@ endfunc // const int16_t *ac, const int alpha); function ipred_cfl_top_8bpc_neon, export=1 clz w9, w3 - adr x7, L(ipred_cfl_top_tbl) + adrp x7, L(ipred_cfl_top_tbl) + add x7, x7, :lo12: L(ipred_cfl_top_tbl) sub w9, w9, #26 - ldrh w9, [x7, w9, uxtw #1] + ldr x7, [x7, w9, uxtw #3] dup v1.8h, w6 // alpha add x2, x2, #1 - sub x7, x7, w9, uxtw add x6, x0, x1 lsl x1, x1, #1 br x7 @@ -4186,11 +4224,13 @@ function ipred_cfl_top_8bpc_neon, export=1 dup v0.8h, v2.h[0] b L(ipred_cfl_splat_w16) + .pushsection .data.rel.ro, "aw" L(ipred_cfl_top_tbl): - .hword L(ipred_cfl_top_tbl) - 32b - .hword L(ipred_cfl_top_tbl) - 16b - .hword L(ipred_cfl_top_tbl) - 8b - .hword L(ipred_cfl_top_tbl) - 4b + .xword 32b + .xword 16b + .xword 8b + .xword 4b + .popsection endfunc // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -4201,15 +4241,15 @@ function ipred_cfl_left_8bpc_neon, export=1 sub x2, x2, w4, uxtw clz w9, w3 clz w8, w4 - adr x10, L(ipred_cfl_splat_tbl) - adr x7, L(ipred_cfl_left_tbl) + adrp x10, L(ipred_cfl_splat_tbl) + add x10, x10, :lo12: L(ipred_cfl_splat_tbl) + adrp x7, L(ipred_cfl_left_tbl) + add x7, x7, :lo12: L(ipred_cfl_left_tbl) sub w9, w9, #26 sub w8, w8, #26 - ldrh w9, [x10, w9, uxtw #1] - ldrh w8, [x7, w8, uxtw #1] + ldr x9, [x10, w9, uxtw #3] + ldr x7, [x7, w8, uxtw #3] dup v1.8h, w6 // alpha - sub x9, x10, w9, uxtw - sub x7, x7, w8, uxtw add x6, x0, x1 lsl x1, x1, #1 br x7 @@ -4248,11 +4288,13 @@ L(ipred_cfl_left_h32): dup v0.8h, v2.h[0] br x9 + .pushsection .data.rel.ro, "aw" L(ipred_cfl_left_tbl): - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32) - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16) - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8) - .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4) + .xword L(ipred_cfl_left_h32) + .xword L(ipred_cfl_left_h16) + .xword L(ipred_cfl_left_h8) + .xword L(ipred_cfl_left_h4) + .popsection endfunc // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride, @@ -4266,16 +4308,15 @@ function ipred_cfl_8bpc_neon, export=1 clz w9, w3 clz w6, w4 dup v16.8h, w8 // width + height - adr x7, L(ipred_cfl_tbl) + adrp x7, L(ipred_cfl_tbl) + add x7, x7, :lo12: L(ipred_cfl_tbl) rbit w8, w8 // rbit(width + height) sub w9, w9, #22 // 26 leading bits, minus table offset 4 sub w6, w6, #26 clz w8, w8 // ctz(width + height) - ldrh w9, [x7, w9, uxtw #1] - ldrh w6, [x7, w6, uxtw #1] + ldr x9, [x7, w9, uxtw #3] + ldr x7, [x7, w6, uxtw #3] neg w8, w8 // -ctz(width + height) - sub x9, x7, w9, uxtw - sub x7, x7, w6, uxtw ushr v16.8h, v16.8h, #1 // (width + height) >> 1 dup v17.8h, w8 // -ctz(width + height) add x6, x0, x1 @@ -4392,15 +4433,17 @@ L(ipred_cfl_w32): dup v0.8h, v0.h[0] b L(ipred_cfl_splat_w16) + .pushsection .data.rel.ro, "aw" L(ipred_cfl_tbl): - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8) - .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4) + .xword L(ipred_cfl_h32) + .xword L(ipred_cfl_h16) + .xword L(ipred_cfl_h8) + .xword L(ipred_cfl_h4) + .xword L(ipred_cfl_w32) + .xword L(ipred_cfl_w16) + .xword L(ipred_cfl_w8) + .xword L(ipred_cfl_w4) + .popsection endfunc // void cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx, @@ -4409,14 +4452,14 @@ endfunc function ipred_cfl_ac_420_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 - adr x7, L(ipred_cfl_ac_420_tbl) + adrp x7, L(ipred_cfl_ac_420_tbl) + add x7, x7, :lo12: L(ipred_cfl_ac_420_tbl) sub w8, w8, #27 - ldrh w8, [x7, w8, uxtw #1] + ldr x7, [x7, w8, uxtw #3] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 - sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) @@ -4555,9 +4598,9 @@ L(ipred_cfl_ac_420_w8_subtract_dc): L(ipred_cfl_ac_420_w16): AARCH64_VALID_JUMP_TARGET - adr x7, L(ipred_cfl_ac_420_w16_tbl) - ldrh w3, [x7, w3, uxtw #1] - sub x7, x7, w3, uxtw + adrp x7, L(ipred_cfl_ac_420_w16_tbl) + add x7, x7, :lo12: L(ipred_cfl_ac_420_w16_tbl) + ldr x7, [x7, w3, uxtw #3] br x7 L(ipred_cfl_ac_420_w16_wpad0): @@ -4714,17 +4757,19 @@ L(ipred_cfl_ac_420_w16_hpad): lsl w6, w6, #1 b L(ipred_cfl_ac_420_w8_calc_subtract_dc) + .pushsection .data.rel.ro, "aw" L(ipred_cfl_ac_420_tbl): - .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16) - .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8) - .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4) - .hword 0 + .xword L(ipred_cfl_ac_420_w16) + .xword L(ipred_cfl_ac_420_w8) + .xword L(ipred_cfl_ac_420_w4) + .xword 0 L(ipred_cfl_ac_420_w16_tbl): - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0) - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1) - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2) - .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3) + .xword L(ipred_cfl_ac_420_w16_wpad0) + .xword L(ipred_cfl_ac_420_w16_wpad1) + .xword L(ipred_cfl_ac_420_w16_wpad2) + .xword L(ipred_cfl_ac_420_w16_wpad3) + .popsection endfunc // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx, @@ -4733,14 +4778,14 @@ endfunc function ipred_cfl_ac_422_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 - adr x7, L(ipred_cfl_ac_422_tbl) + adrp x7, L(ipred_cfl_ac_422_tbl) + add x7, x7, :lo12: L(ipred_cfl_ac_422_tbl) sub w8, w8, #27 - ldrh w8, [x7, w8, uxtw #1] + ldr x7, [x7, w8, uxtw #3] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 - sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) @@ -4831,9 +4876,9 @@ L(ipred_cfl_ac_422_w8_wpad): L(ipred_cfl_ac_422_w16): AARCH64_VALID_JUMP_TARGET - adr x7, L(ipred_cfl_ac_422_w16_tbl) - ldrh w3, [x7, w3, uxtw #1] - sub x7, x7, w3, uxtw + adrp x7, L(ipred_cfl_ac_422_w16_tbl) + add x7, x7, :lo12: L(ipred_cfl_ac_422_w16_tbl) + ldr x7, [x7, w3, uxtw #3] br x7 L(ipred_cfl_ac_422_w16_wpad0): @@ -4936,17 +4981,19 @@ L(ipred_cfl_ac_422_w16_wpad3): mov v1.16b, v3.16b b L(ipred_cfl_ac_420_w16_hpad) + .pushsection .data.rel.ro, "aw" L(ipred_cfl_ac_422_tbl): - .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16) - .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8) - .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4) - .hword 0 + .xword L(ipred_cfl_ac_422_w16) + .xword L(ipred_cfl_ac_422_w8) + .xword L(ipred_cfl_ac_422_w4) + .xword 0 L(ipred_cfl_ac_422_w16_tbl): - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0) - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1) - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2) - .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3) + .xword L(ipred_cfl_ac_422_w16_wpad0) + .xword L(ipred_cfl_ac_422_w16_wpad1) + .xword L(ipred_cfl_ac_422_w16_wpad2) + .xword L(ipred_cfl_ac_422_w16_wpad3) + .popsection endfunc // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx, @@ -4955,14 +5002,14 @@ endfunc function ipred_cfl_ac_444_8bpc_neon, export=1 clz w8, w5 lsl w4, w4, #2 - adr x7, L(ipred_cfl_ac_444_tbl) + adrp x7, L(ipred_cfl_ac_444_tbl) + add x7, x7, :lo12: L(ipred_cfl_ac_444_tbl) sub w8, w8, #26 - ldrh w8, [x7, w8, uxtw #1] + ldr x7, [x7, w8, uxtw #3] movi v16.8h, #0 movi v17.8h, #0 movi v18.8h, #0 movi v19.8h, #0 - sub x7, x7, w8, uxtw sub w8, w6, w4 // height - h_pad rbit w9, w5 // rbit(width) rbit w10, w6 // rbit(height) @@ -5083,9 +5130,10 @@ L(ipred_cfl_ac_444_w16_wpad): L(ipred_cfl_ac_444_w32): AARCH64_VALID_JUMP_TARGET - adr x7, L(ipred_cfl_ac_444_w32_tbl) - ldrh w3, [x7, w3, uxtw] // (w3>>1) << 1 - sub x7, x7, w3, uxtw + adrp x7, L(ipred_cfl_ac_444_w32_tbl) + add x7, x7, :lo12: L(ipred_cfl_ac_444_w32_tbl) + lsr w3, w3, #1 + ldr x7, [x7, w3, uxtw #3] // (w3>>1) << 3 br x7 L(ipred_cfl_ac_444_w32_wpad0): @@ -5231,15 +5279,17 @@ L(ipred_cfl_ac_444_w32_hpad): dup v4.8h, v4.h[0] b L(ipred_cfl_ac_420_w8_subtract_dc) + .pushsection .data.rel.ro, "aw" L(ipred_cfl_ac_444_tbl): - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32) - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16) - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8) - .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4) + .xword L(ipred_cfl_ac_444_w32) + .xword L(ipred_cfl_ac_444_w16) + .xword L(ipred_cfl_ac_444_w8) + .xword L(ipred_cfl_ac_444_w4) L(ipred_cfl_ac_444_w32_tbl): - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0) - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2) - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4) - .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6) + .xword L(ipred_cfl_ac_444_w32_wpad0) + .xword L(ipred_cfl_ac_444_w32_wpad2) + .xword L(ipred_cfl_ac_444_w32_wpad4) + .xword L(ipred_cfl_ac_444_w32_wpad6) + .popsection endfunc