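ports/multimedia/dav1d/patches/patch-src_arm_64_ipred16_S

Replace the PC-relative 16-bit jump-offset tables that upstream embeds in
.text with tables of absolute 64-bit label addresses (.xword) emitted into
.data.rel.ro via .pushsection/.popsection.  Each dispatch site now forms
the table address with adrp + add :lo12: and loads the branch target
directly with a 64-bit ldr (index scaled by 8), replacing the former
adr + ldrh + sub sequence.
In ipred_z3_fill_padding_neon the table targets additionally get their own
labels (20/40/80/160/320/640) marked with AARCH64_VALID_JUMP_TARGET.
Presumably this is needed so the code never reads data out of the text
segment (e.g. execute-only mappings); confirm against the port commit log.
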
Index: src/arm/64/ipred16.S
--- src/arm/64/ipred16.S.orig
+++ src/arm/64/ipred16.S
@@ -36,11 +36,11 @@
 function ipred_dc_128_16bpc_neon, export=1
         ldr             w8,  [sp]
         clz             w3,  w3
-        adr             x5,  L(ipred_dc_128_tbl)
+        adrp            x5,  L(ipred_dc_128_tbl)
+        add             x5,  x5, :lo12: L(ipred_dc_128_tbl)
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldr             x5,  [x5, w3, uxtw #3]
         dup             v0.8h,   w8
-        sub             x5,  x5,  w3, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         urshr           v0.8h,   v0.8h,  #1
@@ -106,12 +106,14 @@ function ipred_dc_128_16bpc_neon, export=1
         b.gt            64b
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_dc_128_tbl):
-        .hword L(ipred_dc_128_tbl) - 640b
-        .hword L(ipred_dc_128_tbl) - 320b
-        .hword L(ipred_dc_128_tbl) - 160b
-        .hword L(ipred_dc_128_tbl) -   8b
-        .hword L(ipred_dc_128_tbl) -   4b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword   8b
+        .xword   4b
+	.popsection
 endfunc

 // void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -120,11 +122,11 @@ endfunc
 //                         const int max_width, const int max_height);
 function ipred_v_16bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_v_tbl)
+        adrp            x5,  L(ipred_v_tbl)
+        add             x5,  x5, :lo12: L(ipred_v_tbl)
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldr             x5,  [x5, w3, uxtw #3]
         add             x2,  x2,  #2
-        sub             x5,  x5,  w3, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -190,12 +192,14 @@ function ipred_v_16bpc_neon, export=1
         b.gt            64b
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_v_tbl):
-        .hword L(ipred_v_tbl) - 640b
-        .hword L(ipred_v_tbl) - 320b
-        .hword L(ipred_v_tbl) - 160b
-        .hword L(ipred_v_tbl) -  80b
-        .hword L(ipred_v_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 // void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -204,11 +208,11 @@ endfunc
 //                         const int max_width, const int max_height);
 function ipred_h_16bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_h_tbl)
+        adrp            x5,  L(ipred_h_tbl)
+        add             x5,  x5, :lo12: L(ipred_h_tbl)
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldr             x5,  [x5, w3, uxtw #3]
         sub             x2,  x2,  #8
-        sub             x5,  x5,  w3, uxtw
         mov             x7,  #-8
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
@@ -292,12 +296,14 @@ function ipred_h_16bpc_neon, export=1
         b.gt            64b
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_h_tbl):
-        .hword L(ipred_h_tbl) - 64b
-        .hword L(ipred_h_tbl) - 32b
-        .hword L(ipred_h_tbl) - 16b
-        .hword L(ipred_h_tbl) -  8b
-        .hword L(ipred_h_tbl) -  4b
+        .xword 64b
+        .xword 32b
+        .xword 16b
+        .xword  8b
+        .xword  4b
+	.popsection
 endfunc

 // void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -306,11 +312,11 @@ endfunc
 //                              const int max_width, const int max_height);
 function ipred_dc_top_16bpc_neon, export=1
         clz             w3,  w3
-        adr             x5,  L(ipred_dc_top_tbl)
+        adrp            x5,  L(ipred_dc_top_tbl)
+        add             x5,  x5, :lo12: L(ipred_dc_top_tbl)
         sub             w3,  w3,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
+        ldr             x5,  [x5, w3, uxtw #3]
         add             x2,  x2,  #2
-        sub             x5,  x5,  w3, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -409,12 +415,14 @@ function ipred_dc_top_16bpc_neon, export=1
         b.gt            64b
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_dc_top_tbl):
-        .hword L(ipred_dc_top_tbl) - 640b
-        .hword L(ipred_dc_top_tbl) - 320b
-        .hword L(ipred_dc_top_tbl) - 160b
-        .hword L(ipred_dc_top_tbl) -  80b
-        .hword L(ipred_dc_top_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 // void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -425,13 +433,12 @@ function ipred_dc_left_16bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw #1
         clz             w3,  w3
         clz             w7,  w4
-        adr             x5,  L(ipred_dc_left_tbl)
+        adrp            x5,  L(ipred_dc_left_tbl)
+        add             x5,  x5, :lo12: L(ipred_dc_left_tbl)
         sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
         sub             w7,  w7,  #25
-        ldrh            w3,  [x5, w3, uxtw #1]
-        ldrh            w7,  [x5, w7, uxtw #1]
-        sub             x3,  x5,  w3, uxtw
-        sub             x5,  x5,  w7, uxtw
+        ldr             x3,  [x5, w3, uxtw #3]
+        ldr             x5,  [x5, w7, uxtw #3]
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -550,17 +557,19 @@ L(ipred_dc_left_w64):
         b.gt            1b
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_dc_left_tbl):
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
-        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
+        .xword L(ipred_dc_left_h64)
+        .xword L(ipred_dc_left_h32)
+        .xword L(ipred_dc_left_h16)
+        .xword L(ipred_dc_left_h8)
+        .xword L(ipred_dc_left_h4)
+        .xword L(ipred_dc_left_w64)
+        .xword L(ipred_dc_left_w32)
+        .xword L(ipred_dc_left_w16)
+        .xword L(ipred_dc_left_w8)
+        .xword L(ipred_dc_left_w4)
+	.popsection
 endfunc

 // void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -573,16 +582,15 @@ function ipred_dc_16bpc_neon, export=1
         clz             w3,  w3
         clz             w6,  w4
         dup             v16.4s, w7               // width + height
-        adr             x5,  L(ipred_dc_tbl)
+        adrp            x5,  L(ipred_dc_tbl)
+        add             x5,  x5, :lo12: L(ipred_dc_tbl)
         rbit            w7,  w7                  // rbit(width + height)
         sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
         sub             w6,  w6,  #25
         clz             w7,  w7                  // ctz(width + height)
-        ldrh            w3,  [x5, w3, uxtw #1]
-        ldrh            w6,  [x5, w6, uxtw #1]
+        ldr             x3,  [x5, w3, uxtw #3]
+        ldr             x5,  [x5, w6, uxtw #3]
         neg             w7,  w7                  // -ctz(width + height)
-        sub             x3,  x5,  w3, uxtw
-        sub             x5,  x5,  w6, uxtw
         ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
         dup             v17.4s,  w7              // -ctz(width + height)
         add             x6,  x0,  x1
@@ -795,17 +803,19 @@ L(ipred_dc_w64):
         b.gt            2b
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_dc_tbl):
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
-        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
+        .xword L(ipred_dc_h64)
+        .xword L(ipred_dc_h32)
+        .xword L(ipred_dc_h16)
+        .xword L(ipred_dc_h8)
+        .xword L(ipred_dc_h4)
+        .xword L(ipred_dc_w64)
+        .xword L(ipred_dc_w32)
+        .xword L(ipred_dc_w16)
+        .xword L(ipred_dc_w8)
+        .xword L(ipred_dc_w4)
+	.popsection
 endfunc

 // void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -814,13 +824,13 @@ endfunc
 //                             const int max_width, const int max_height);
 function ipred_paeth_16bpc_neon, export=1
         clz             w9,  w3
-        adr             x5,  L(ipred_paeth_tbl)
+        adrp            x5,  L(ipred_paeth_tbl)
+        add             x5,  x5, :lo12: L(ipred_paeth_tbl)
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldr             x5,  [x5, w9, uxtw #3]
         ld1r            {v4.8h},  [x2]
         add             x8,  x2,  #2
         sub             x2,  x2,  #8
-        sub             x5,  x5,  w9, uxtw
         mov             x7,  #-8
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
@@ -934,12 +944,14 @@ function ipred_paeth_16bpc_neon, export=1
 9:
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_paeth_tbl):
-        .hword L(ipred_paeth_tbl) - 640b
-        .hword L(ipred_paeth_tbl) - 320b
-        .hword L(ipred_paeth_tbl) - 160b
-        .hword L(ipred_paeth_tbl) -  80b
-        .hword L(ipred_paeth_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 // void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -951,13 +963,13 @@ function ipred_smooth_16bpc_neon, export=1
         add             x11, x10, w4, uxtw
         add             x10, x10, w3, uxtw
         clz             w9,  w3
-        adr             x5,  L(ipred_smooth_tbl)
+        adrp            x5,  L(ipred_smooth_tbl)
+        add             x5,  x5, :lo12: L(ipred_smooth_tbl)
         sub             x12, x2,  w4, uxtw #1
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldr             x5,  [x5, w9, uxtw #3]
         ld1r            {v4.8h},  [x12] // bottom
         add             x8,  x2,  #2
-        sub             x5,  x5,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -1138,12 +1150,14 @@ function ipred_smooth_16bpc_neon, export=1
 9:
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_smooth_tbl):
-        .hword L(ipred_smooth_tbl) - 640b
-        .hword L(ipred_smooth_tbl) - 320b
-        .hword L(ipred_smooth_tbl) - 160b
-        .hword L(ipred_smooth_tbl) -  80b
-        .hword L(ipred_smooth_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 // void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -1154,13 +1168,13 @@ function ipred_smooth_v_16bpc_neon, export=1
         movrel          x7,  X(sm_weights)
         add             x7,  x7,  w4, uxtw
         clz             w9,  w3
-        adr             x5,  L(ipred_smooth_v_tbl)
+        adrp            x5,  L(ipred_smooth_v_tbl)
+        add             x5,  x5, :lo12: L(ipred_smooth_v_tbl)
         sub             x8,  x2,  w4, uxtw #1
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldr             x5,  [x5, w9, uxtw #3]
         ld1r            {v4.8h},  [x8] // bottom
         add             x2,  x2,  #2
-        sub             x5,  x5,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -1265,12 +1279,14 @@ function ipred_smooth_v_16bpc_neon, export=1
 9:
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_smooth_v_tbl):
-        .hword L(ipred_smooth_v_tbl) - 640b
-        .hword L(ipred_smooth_v_tbl) - 320b
-        .hword L(ipred_smooth_v_tbl) - 160b
-        .hword L(ipred_smooth_v_tbl) -  80b
-        .hword L(ipred_smooth_v_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 // void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -1281,12 +1297,12 @@ function ipred_smooth_h_16bpc_neon, export=1
         movrel          x8,  X(sm_weights)
         add             x8,  x8,  w3, uxtw
         clz             w9,  w3
-        adr             x5,  L(ipred_smooth_h_tbl)
+        adrp            x5,  L(ipred_smooth_h_tbl)
+        add             x5,  x5, :lo12: L(ipred_smooth_h_tbl)
         add             x12, x2,  w3, uxtw #1
         sub             w9,  w9,  #25
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldr             x5,  [x5, w9, uxtw #3]
         ld1r            {v5.8h},  [x12] // right
-        sub             x5,  x5,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         br              x5
@@ -1397,12 +1413,14 @@ function ipred_smooth_h_16bpc_neon, export=1
 9:
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_smooth_h_tbl):
-        .hword L(ipred_smooth_h_tbl) - 640b
-        .hword L(ipred_smooth_h_tbl) - 320b
-        .hword L(ipred_smooth_h_tbl) - 160b
-        .hword L(ipred_smooth_h_tbl) -  80b
-        .hword L(ipred_smooth_h_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 const padding_mask_buf
@@ -1728,11 +1746,11 @@ endfunc
 //                                const int dx, const int max_base_x);
 function ipred_z1_fill1_16bpc_neon, export=1
         clz             w9,  w3
-        adr             x8,  L(ipred_z1_fill1_tbl)
+        adrp            x8,  L(ipred_z1_fill1_tbl)
+        add             x8,  x8, :lo12: L(ipred_z1_fill1_tbl)
         sub             w9,  w9,  #25
-        ldrh            w9,  [x8, w9, uxtw #1]
+        ldr             x8,  [x8, w9, uxtw #3]
         add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
-        sub             x8,  x8,  w9,  uxtw
         ld1r            {v31.8h}, [x10]           // padding
         mov             w7,  w5
         mov             w15, #64
@@ -1917,12 +1935,14 @@ function ipred_z1_fill1_16bpc_neon, export=1
         mov             w3,  w12
         b               169b

+	.pushsection .data.rel.ro, "aw"
 L(ipred_z1_fill1_tbl):
-        .hword L(ipred_z1_fill1_tbl) - 640b
-        .hword L(ipred_z1_fill1_tbl) - 320b
-        .hword L(ipred_z1_fill1_tbl) - 160b
-        .hword L(ipred_z1_fill1_tbl) -  80b
-        .hword L(ipred_z1_fill1_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 function ipred_z1_fill2_16bpc_neon, export=1
@@ -2050,11 +2070,11 @@ endconst
 //                                const int dx, const int dy);
 function ipred_z2_fill1_16bpc_neon, export=1
         clz             w10, w4
-        adr             x9,  L(ipred_z2_fill1_tbl)
+        adrp            x9,  L(ipred_z2_fill1_tbl)
+        add             x9,  x9, :lo12: L(ipred_z2_fill1_tbl)
         sub             w10, w10, #25
-        ldrh            w10, [x9, w10, uxtw #1]
+        ldr             x9, [x9, w10, uxtw #3]
         mov             w8,  #(1 << 6)            // xpos = 1 << 6
-        sub             x9,  x9,  w10, uxtw
         sub             w8,  w8,  w6              // xpos -= dx

         movrel          x11, increments
@@ -2815,12 +2835,14 @@ function ipred_z2_fill1_16bpc_neon, export=1
         ldp             d8,  d9,  [sp], 0x40
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_z2_fill1_tbl):
-        .hword L(ipred_z2_fill1_tbl) - 640b
-        .hword L(ipred_z2_fill1_tbl) - 320b
-        .hword L(ipred_z2_fill1_tbl) - 160b
-        .hword L(ipred_z2_fill1_tbl) -  80b
-        .hword L(ipred_z2_fill1_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 function ipred_z2_fill2_16bpc_neon, export=1
@@ -3432,11 +3454,11 @@ endfunc
 //                                const int dy, const int max_base_y);
 function ipred_z3_fill1_16bpc_neon, export=1
         clz             w9,  w4
-        adr             x8,  L(ipred_z3_fill1_tbl)
+        adrp            x8,  L(ipred_z3_fill1_tbl)
+        add             x8,  x8, :lo12: L(ipred_z3_fill1_tbl)
         sub             w9,  w9,  #25
-        ldrh            w9,  [x8, w9, uxtw #1]
+        ldr             x8,  [x8, w9, uxtw #3]
         add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
-        sub             x8,  x8,  w9,  uxtw
         ld1r            {v31.8h}, [x10]           // padding
         mov             w7,  w5
         mov             w15, #64
@@ -3638,17 +3660,20 @@ function ipred_z3_fill1_16bpc_neon, export=1
 9:
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_z3_fill1_tbl):
-        .hword L(ipred_z3_fill1_tbl) - 640b
-        .hword L(ipred_z3_fill1_tbl) - 320b
-        .hword L(ipred_z3_fill1_tbl) - 160b
-        .hword L(ipred_z3_fill1_tbl) -  80b
-        .hword L(ipred_z3_fill1_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 function ipred_z3_fill_padding_neon, export=0
         cmp             w3,  #8
-        adr             x8,  L(ipred_z3_fill_padding_tbl)
+        adrp            x8,  L(ipred_z3_fill_padding_tbl)
+        add             x8,  x8, :lo12: L(ipred_z3_fill_padding_tbl)
         b.gt            L(ipred_z3_fill_padding_wide)
         // w3 = remaining width, w4 = constant height
         mov             w12, w4
@@ -3659,10 +3684,11 @@ function ipred_z3_fill_padding_neon, export=0
         // power of two in the remaining width, and repeating.
         clz             w9,  w3
         sub             w9,  w9,  #25
-        ldrh            w9,  [x8, w9, uxtw #1]
-        sub             x9,  x8,  w9,  uxtw
+        ldr             x9,  [x8, w9, uxtw #3]
         br              x9

+20:
+        AARCH64_VALID_JUMP_TARGET
 2:
         st1             {v31.s}[0], [x0],  x1
         subs            w4,  w4,  #4
@@ -3681,6 +3707,8 @@ function ipred_z3_fill_padding_neon, export=0
         mov             w4,  w12
         b               1b

+40:
+        AARCH64_VALID_JUMP_TARGET
 4:
         st1             {v31.4h}, [x0],  x1
         subs            w4,  w4,  #4
@@ -3699,10 +3727,11 @@ function ipred_z3_fill_padding_neon, export=0
         mov             w4,  w12
         b               1b

-8:
-16:
-32:
-64:
+80:
+160:
+320:
+640:
+        AARCH64_VALID_JUMP_TARGET
         st1             {v31.8h}, [x0],  x1
         subs            w4,  w4,  #4
         st1             {v31.8h}, [x13], x1
@@ -3723,13 +3752,15 @@ function ipred_z3_fill_padding_neon, export=0
 9:
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_z3_fill_padding_tbl):
-        .hword L(ipred_z3_fill_padding_tbl) - 64b
-        .hword L(ipred_z3_fill_padding_tbl) - 32b
-        .hword L(ipred_z3_fill_padding_tbl) - 16b
-        .hword L(ipred_z3_fill_padding_tbl) -  8b
-        .hword L(ipred_z3_fill_padding_tbl) -  4b
-        .hword L(ipred_z3_fill_padding_tbl) -  2b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+        .xword  20b
+	.popsection

 L(ipred_z3_fill_padding_wide):
         // Fill a WxH rectangle with padding, with W > 8.
@@ -3880,13 +3911,13 @@ function ipred_filter_\bpc\()bpc_neon
         add             x6,  x6,  w5, uxtw
         ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
         clz             w9,  w3
-        adr             x5,  L(ipred_filter\bpc\()_tbl)
+        adrp            x5,  L(ipred_filter\bpc\()_tbl)
+        add             x5,  x5, :lo12: L(ipred_filter\bpc\()_tbl)
         ld1             {v20.8b, v21.8b, v22.8b}, [x6]
         sub             w9,  w9,  #26
-        ldrh            w9,  [x5, w9, uxtw #1]
+        ldr             x5,  [x5, w9, uxtw #3]
         sxtl            v16.8h,  v16.8b
         sxtl            v17.8h,  v17.8b
-        sub             x5,  x5,  w9, uxtw
         sxtl            v18.8h,  v18.8b
         sxtl            v19.8h,  v19.8b
         add             x6,  x0,  x1
@@ -4160,11 +4191,13 @@ function ipred_filter_\bpc\()bpc_neon
 9:
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_filter\bpc\()_tbl):
-        .hword L(ipred_filter\bpc\()_tbl) - 320b
-        .hword L(ipred_filter\bpc\()_tbl) - 160b
-        .hword L(ipred_filter\bpc\()_tbl) -  80b
-        .hword L(ipred_filter\bpc\()_tbl) -  40b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc
 .endm

@@ -4184,11 +4217,11 @@ endfunc
 function pal_pred_16bpc_neon, export=1
         ld1             {v30.8h}, [x2]
         clz             w9,  w4
-        adr             x6,  L(pal_pred_tbl)
+        adrp            x6,  L(pal_pred_tbl)
+        add             x6,  x6, :lo12: L(pal_pred_tbl)
         sub             w9,  w9,  #25
-        ldrh            w9,  [x6, w9, uxtw #1]
+        ldr             x6,  [x6, w9, uxtw #3]
         movi            v31.8h,  #1, lsl #8
-        sub             x6,  x6,  w9, uxtw
         br              x6
 40:
         AARCH64_VALID_JUMP_TARGET
@@ -4357,12 +4390,14 @@ function pal_pred_16bpc_neon, export=1
         b.gt            64b
         ret

+	.pushsection .data.rel.ro, "aw"
 L(pal_pred_tbl):
-        .hword L(pal_pred_tbl) - 640b
-        .hword L(pal_pred_tbl) - 320b
-        .hword L(pal_pred_tbl) - 160b
-        .hword L(pal_pred_tbl) -  80b
-        .hword L(pal_pred_tbl) -  40b
+        .xword 640b
+        .xword 320b
+        .xword 160b
+        .xword  80b
+        .xword  40b
+	.popsection
 endfunc

 // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -4373,12 +4408,12 @@ endfunc
 function ipred_cfl_128_16bpc_neon, export=1
         dup             v31.8h,  w7   // bitdepth_max
         clz             w9,  w3
-        adr             x7,  L(ipred_cfl_128_tbl)
+        adrp            x7,  L(ipred_cfl_128_tbl)
+        add             x7,  x7, :lo12: L(ipred_cfl_128_tbl)
         sub             w9,  w9,  #26
-        ldrh            w9,  [x7, w9, uxtw #1]
+        ldr             x7,  [x7, w9, uxtw #3]
         urshr           v0.8h,   v31.8h,  #1
         dup             v1.8h,   w6   // alpha
-        sub             x7,  x7,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
@@ -4510,12 +4545,14 @@ L(ipred_cfl_splat_w16):
         b.gt            1b
         ret

+	.pushsection .data.rel.ro, "aw"
 L(ipred_cfl_128_tbl):
 L(ipred_cfl_splat_tbl):
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
-        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
+        .xword L(ipred_cfl_splat_w16)
+        .xword L(ipred_cfl_splat_w16)
+        .xword L(ipred_cfl_splat_w8)
+        .xword L(ipred_cfl_splat_w4)
+	.popsection
 endfunc

 // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -4526,12 +4563,12 @@ endfunc
 function ipred_cfl_top_16bpc_neon, export=1
         dup             v31.8h,  w7   // bitdepth_max
         clz             w9,  w3
-        adr             x7,  L(ipred_cfl_top_tbl)
+        adrp            x7,  L(ipred_cfl_top_tbl)
+        add             x7,  x7, :lo12: L(ipred_cfl_top_tbl)
         sub             w9,  w9,  #26
-        ldrh            w9,  [x7, w9, uxtw #1]
+        ldr             x7,  [x7, w9, uxtw #3]
         dup             v1.8h,   w6   // alpha
         add             x2,  x2,  #2
-        sub             x7,  x7,  w9, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
@@ -4569,11 +4606,13 @@ function ipred_cfl_top_16bpc_neon, export=1
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)

+	.pushsection .data.rel.ro, "aw"
 L(ipred_cfl_top_tbl):
-        .hword L(ipred_cfl_top_tbl) - 32b
-        .hword L(ipred_cfl_top_tbl) - 16b
-        .hword L(ipred_cfl_top_tbl) -  8b
-        .hword L(ipred_cfl_top_tbl) -  4b
+        .xword 32b
+        .xword 16b
+        .xword  8b
+        .xword  4b
+	.popsection
 endfunc

 // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -4586,15 +4625,15 @@ function ipred_cfl_left_16bpc_neon, export=1
         sub             x2,  x2,  w4, uxtw #1
         clz             w9,  w3
         clz             w8,  w4
-        adr             x10, L(ipred_cfl_splat_tbl)
-        adr             x7,  L(ipred_cfl_left_tbl)
+        adrp            x10, L(ipred_cfl_splat_tbl)
+        add             x10, x10, :lo12: L(ipred_cfl_splat_tbl)
+        adrp            x7,  L(ipred_cfl_left_tbl)
+        add             x7, x7, :lo12: L(ipred_cfl_left_tbl)
         sub             w9,  w9,  #26
         sub             w8,  w8,  #26
-        ldrh            w9,  [x10, w9, uxtw #1]
-        ldrh            w8,  [x7,  w8, uxtw #1]
+        ldr             x9,  [x10, w9, uxtw #3]
+        ldr             x7,  [x7,  w8, uxtw #3]
         dup             v1.8h,   w6   // alpha
-        sub             x9,  x10, w9, uxtw
-        sub             x7,  x7,  w8, uxtw
         add             x6,  x0,  x1
         lsl             x1,  x1,  #1
         movi            v30.8h,  #0
@@ -4636,11 +4675,13 @@ L(ipred_cfl_left_h32):
         dup             v0.8h,   v0.h[0]
         br              x9

+	.pushsection .data.rel.ro, "aw"
 L(ipred_cfl_left_tbl):
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
-        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
+        .xword L(ipred_cfl_left_h32)
+        .xword L(ipred_cfl_left_h16)
+        .xword L(ipred_cfl_left_h8)
+        .xword L(ipred_cfl_left_h4)
+	.popsection
 endfunc

 // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
@@ -4656,16 +4697,15 @@ function ipred_cfl_16bpc_neon, export=1
         clz             w9,  w3
         clz             w6,  w4
         dup             v16.4s, w8               // width + height
-        adr             x7,  L(ipred_cfl_tbl)
+        adrp            x7,  L(ipred_cfl_tbl)
+        add             x7,  x7, :lo12: L(ipred_cfl_tbl)
         rbit            w8,  w8                  // rbit(width + height)
         sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
         sub             w6,  w6,  #26
         clz             w8,  w8                  // ctz(width + height)
-        ldrh            w9,  [x7, w9, uxtw #1]
-        ldrh            w6,  [x7, w6, uxtw #1]
+        ldr             x9,  [x7, w9, uxtw #3]
+        ldr             x7,  [x7, w6, uxtw #3]
         neg             w8,  w8                  // -ctz(width + height)
-        sub             x9,  x7,  w9, uxtw
-        sub             x7,  x7,  w6, uxtw
         ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
         dup             v17.4s,  w8              // -ctz(width + height)
         add             x6,  x0,  x1
@@ -4789,15 +4829,17 @@ L(ipred_cfl_w32):
         dup             v0.8h,   v0.h[0]
         b               L(ipred_cfl_splat_w16)

+	.pushsection .data.rel.ro, "aw"
 L(ipred_cfl_tbl):
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
-        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
+        .xword L(ipred_cfl_h32)
+        .xword L(ipred_cfl_h16)
+        .xword L(ipred_cfl_h8)
+        .xword L(ipred_cfl_h4)
+        .xword L(ipred_cfl_w32)
+        .xword L(ipred_cfl_w16)
+        .xword L(ipred_cfl_w8)
+        .xword L(ipred_cfl_w4)
+	.popsection
 endfunc

 // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
@@ -4806,14 +4848,14 @@ endfunc
 function ipred_cfl_ac_420_16bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
-        adr             x7,  L(ipred_cfl_ac_420_tbl)
+        adrp            x7,  L(ipred_cfl_ac_420_tbl)
+        add             x7,  x7, :lo12: L(ipred_cfl_ac_420_tbl)
         sub             w8,  w8,  #27
-        ldrh            w8,  [x7, w8, uxtw #1]
+        ldr             x7,  [x7, w8, uxtw #3]
         movi            v24.4s,  #0
         movi            v25.4s,  #0
         movi            v26.4s,  #0
         movi            v27.4s,  #0
-        sub             x7,  x7,  w8, uxtw
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
         rbit            w10, w6              // rbit(height)
@@ -4945,9 +4987,9 @@ L(ipred_cfl_ac_420_w8_hpad):

 L(ipred_cfl_ac_420_w16):
         AARCH64_VALID_JUMP_TARGET
-        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
-        ldrh            w3,  [x7, w3, uxtw #1]
-        sub             x7,  x7,  w3, uxtw
+        adrp            x7,  L(ipred_cfl_ac_420_w16_tbl)
+        add             x7,  x7, :lo12: L(ipred_cfl_ac_420_w16_tbl)
+        ldr             x7,  [x7, w3, uxtw #3]
         br              x7

 L(ipred_cfl_ac_420_w16_wpad0):
@@ -5124,17 +5166,19 @@ L(ipred_cfl_ac_420_w16_hpad):
         lsl             w6,  w6,  #2
         b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)

+	.pushsection .data.rel.ro, "aw"
 L(ipred_cfl_ac_420_tbl):
-        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
-        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
-        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
-        .hword 0
+        .xword L(ipred_cfl_ac_420_w16)
+        .xword L(ipred_cfl_ac_420_w8)
+        .xword L(ipred_cfl_ac_420_w4)
+        .xword 0

 L(ipred_cfl_ac_420_w16_tbl):
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
-        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
+        .xword L(ipred_cfl_ac_420_w16_wpad0)
+        .xword L(ipred_cfl_ac_420_w16_wpad1)
+        .xword L(ipred_cfl_ac_420_w16_wpad2)
+        .xword L(ipred_cfl_ac_420_w16_wpad3)
+	.popsection
 endfunc

 // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
@@ -5143,14 +5187,14 @@ endfunc
 function ipred_cfl_ac_422_16bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
-        adr             x7,  L(ipred_cfl_ac_422_tbl)
+        adrp            x7,  L(ipred_cfl_ac_422_tbl) // x7 = 4 KiB page holding the table
+        add             x7,  x7, :lo12: L(ipred_cfl_ac_422_tbl) // x7 = full address of the table
         sub             w8,  w8,  #27
-        ldrh            w8,  [x7, w8, uxtw #1]
+        ldr             x7,  [x7, w8, uxtw #3] // x7 = 8-byte table entry number w8 (w8 = clz(w5) - 27); replaces the ldrh + sub offset arithmetic
         movi            v24.4s,  #0
         movi            v25.4s,  #0
         movi            v26.4s,  #0
         movi            v27.4s,  #0
-        sub             x7,  x7,  w8, uxtw
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
         rbit            w10, w6              // rbit(height)
@@ -5251,9 +5295,9 @@ L(ipred_cfl_ac_422_w8_wpad):

 L(ipred_cfl_ac_422_w16):
         AARCH64_VALID_JUMP_TARGET
-        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
-        ldrh            w3,  [x7, w3, uxtw #1]
-        sub             x7,  x7,  w3, uxtw
+        adrp            x7,  L(ipred_cfl_ac_422_w16_tbl) // x7 = 4 KiB page holding the table
+        add             x7,  x7, :lo12: L(ipred_cfl_ac_422_w16_tbl) // x7 = full address of the table
+        ldr             x7,  [x7, w3, uxtw #3] // x7 = 8-byte table entry number w3, used as the branch target; unlike the removed ldrh, w3 is not overwritten
         br              x7

 L(ipred_cfl_ac_422_w16_wpad0):
@@ -5372,17 +5416,19 @@ L(ipred_cfl_ac_422_w16_wpad3):
         mov             v1.16b,  v3.16b
         b               L(ipred_cfl_ac_420_w16_hpad)

+	.pushsection .data.rel.ro, "aw" // emit both tables into .data.rel.ro; NOTE(review): no alignment directive precedes the 8-byte entries - confirm
 L(ipred_cfl_ac_422_tbl):
-        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
-        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
-        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
-        .hword 0
+        .xword L(ipred_cfl_ac_422_w16) // entry 0: label address instead of a 16-bit table-relative offset
+        .xword L(ipred_cfl_ac_422_w8) // entry 1
+        .xword L(ipred_cfl_ac_422_w4) // entry 2
+        .xword 0 // entry 3: zero, replaces the old ".hword 0"; presumably never selected - confirm

 L(ipred_cfl_ac_422_w16_tbl):
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
-        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
+        .xword L(ipred_cfl_ac_422_w16_wpad0) // entry 0
+        .xword L(ipred_cfl_ac_422_w16_wpad1) // entry 1
+        .xword L(ipred_cfl_ac_422_w16_wpad2) // entry 2
+        .xword L(ipred_cfl_ac_422_w16_wpad3) // entry 3
+	.popsection // return to the section that was active before .pushsection
 endfunc

 // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
@@ -5391,14 +5437,14 @@ endfunc
 function ipred_cfl_ac_444_16bpc_neon, export=1
         clz             w8,  w5
         lsl             w4,  w4,  #2
-        adr             x7,  L(ipred_cfl_ac_444_tbl)
+        adrp            x7,  L(ipred_cfl_ac_444_tbl) // x7 = 4 KiB page holding the table
+        add             x7,  x7, :lo12: L(ipred_cfl_ac_444_tbl) // x7 = full address of the table
         sub             w8,  w8,  #26
-        ldrh            w8,  [x7, w8, uxtw #1]
+        ldr             x7,  [x7, w8, uxtw #3] // x7 = 8-byte table entry number w8 (w8 = clz(w5) - 26); replaces the ldrh + sub offset arithmetic
         movi            v24.4s,  #0
         movi            v25.4s,  #0
         movi            v26.4s,  #0
         movi            v27.4s,  #0
-        sub             x7,  x7,  w8, uxtw
         sub             w8,  w6,  w4         // height - h_pad
         rbit            w9,  w5              // rbit(width)
         rbit            w10, w6              // rbit(height)
@@ -5507,10 +5553,11 @@ L(ipred_cfl_ac_444_w16_wpad):

 L(ipred_cfl_ac_444_w32):
         AARCH64_VALID_JUMP_TARGET
-        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
-        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
+        adrp            x7,  L(ipred_cfl_ac_444_w32_tbl) // x7 = 4 KiB page holding the table
+        add             x7,  x7, :lo12: L(ipred_cfl_ac_444_w32_tbl) // x7 = full address of the table
+        lsr             w3,  w3, #1 // halve w3 to get the entry number; the old code used w3 directly as a byte offset into 2-byte entries
+        ldr             x7,  [x7, w3, uxtw #3] // (w3>>1) << 3
         lsr             x2,  x2,  #1 // Restore the stride to one line increments
-        sub             x7,  x7,  w3, uxtw
         br              x7

 L(ipred_cfl_ac_444_w32_wpad0):
@@ -5625,15 +5672,17 @@ L(ipred_cfl_ac_444_w32_hpad):
         lsl             w6,  w6,  #3
         b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)

+	.pushsection .data.rel.ro, "aw" // emit both tables into .data.rel.ro; NOTE(review): no alignment directive precedes the 8-byte entries - confirm
 L(ipred_cfl_ac_444_tbl):
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
-        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
+        .xword L(ipred_cfl_ac_444_w32) // entry 0: label address instead of a 16-bit table-relative offset
+        .xword L(ipred_cfl_ac_444_w16) // entry 1
+        .xword L(ipred_cfl_ac_444_w8) // entry 2
+        .xword L(ipred_cfl_ac_444_w4) // entry 3

 L(ipred_cfl_ac_444_w32_tbl):
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
-        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
+        .xword L(ipred_cfl_ac_444_w32_wpad0) // entry 0
+        .xword L(ipred_cfl_ac_444_w32_wpad2) // entry 1
+        .xword L(ipred_cfl_ac_444_w32_wpad4) // entry 2
+        .xword L(ipred_cfl_ac_444_w32_wpad6) // entry 3
+	.popsection // return to the section that was active before .pushsection
 endfunc