ports/multimedia/dav1d/patches/patch-src_x86_itx_avx512_asm

323 lines
11 KiB
Text

Index: src/x86/itx_avx512.asm
--- src/x86/itx_avx512.asm.orig
+++ src/x86/itx_avx512.asm
@@ -475,6 +475,7 @@ cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride,
pshufb m1, m3, m2
jmp tx2q
.pass2:
+ _CET_ENDBR
IDCT4_1D_PACKED
pxor ymm16, ymm16
mova [cq], ymm16
@@ -495,6 +496,7 @@ cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride,
punpcklwd m0, m3
jmp tx2q
.pass2:
+ _CET_ENDBR
call .main
.end:
pxor ymm16, ymm16
@@ -521,6 +523,7 @@ cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, str
punpckhwd m1, m2
jmp tx2q
.pass2:
+ _CET_ENDBR
call m(iadst_4x4_internal_8bpc).main
.end:
pxor ymm16, ymm16
@@ -547,6 +550,7 @@ cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, str
punpcklwd m0, m2
jmp tx2q
.pass2:
+ _CET_ENDBR
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
@@ -693,6 +697,7 @@ cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride,
pshufb m1, m3, m2
jmp tx2q
.pass2:
+ _CET_ENDBR
vextracti32x4 xm2, m0, 1
vextracti32x4 xm3, m1, 1
call .main
@@ -724,6 +729,7 @@ cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride,
punpcklwd m0, m3
jmp tx2q
.pass2:
+ _CET_ENDBR
vextracti32x4 xm2, m0, 1
vextracti32x4 xm3, m1, 1
pshufd xm4, xm0, q1032
@@ -787,6 +793,7 @@ cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, str
punpckhwd m1, m3
jmp tx2q
.pass2:
+ _CET_ENDBR
vextracti32x4 xm2, m0, 1
vextracti32x4 xm3, m1, 1
pshufd xm4, xm0, q1032
@@ -818,6 +825,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, str
vextracti32x8 ym1, m0, 1
jmp tx2q
.pass2:
+ _CET_ENDBR
vpbroadcastd ym4, [o(pw_4096)]
jmp m(iadst_4x8_internal_8bpc).end2
@@ -935,6 +943,7 @@ cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride,
pmulhrsw m1, m4
jmp tx2q
.pass2:
+ _CET_ENDBR
vextracti32x4 xm2, ym0, 1
vextracti32x4 xm3, ym1, 1
vextracti32x4 xm4, m0, 2
@@ -975,6 +984,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride
punpcklwd m0, m2
jmp tx2q
.pass2:
+ _CET_ENDBR
call .main
vpbroadcastd m5, [o(pw_2048)]
psrlq m10, 4
@@ -1082,6 +1092,7 @@ cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, st
punpckhwd m1, m2
jmp tx2q
.pass2:
+ _CET_ENDBR
call m(iadst_4x16_internal_8bpc).main
vpbroadcastd m6, [o(pw_2048)]
psrlq m10, 12
@@ -1109,6 +1120,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, st
punpckhdq m1, m2
jmp tx2q
.pass2:
+ _CET_ENDBR
vpbroadcastd m3, [o(pw_1697x16)]
vpbroadcastd m5, [o(pw_2048)]
pmulhrsw m2, m3, m0
@@ -1181,6 +1193,7 @@ cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride,
pshufb m1, m4
jmp tx2q
.pass2:
+ _CET_ENDBR
IDCT4_1D_PACKED
vpermq m0, m0, q3120
vpermq m1, m1, q2031
@@ -1210,6 +1223,7 @@ cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride,
punpcklwd m0, m3
jmp tx2q
.pass2:
+ _CET_ENDBR
call .main
.end:
vpermq m0, m0, q3120
@@ -1253,6 +1267,7 @@ cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, str
punpcklwd m0, m3
jmp tx2q
.pass2:
+ _CET_ENDBR
call m(iadst_8x4_internal_8bpc).main
mova m2, m1
vpermq m1, m0, q2031
@@ -1280,6 +1295,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, str
paddsw m1, m1
jmp tx2q
.pass2:
+ _CET_ENDBR
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
@@ -1349,6 +1365,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride,
vshufi32x4 m3, m5, m3, 0x03
jmp tx2q
.pass2:
+ _CET_ENDBR
call .main
vpbroadcastd m4, [o(pw_2048)]
vpermq m0, m0, q3120
@@ -1388,6 +1405,7 @@ cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride,
vinserti32x4 m1, m4, xm1, 1
jmp tx2q
.pass2:
+ _CET_ENDBR
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call .main_pass2
@@ -1455,6 +1473,7 @@ cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, str
vshufi32x4 m2, m4, m2, 0x03
jmp tx2q
.pass2:
+ _CET_ENDBR
pshufd m4, m0, q1032
pshufd m5, m1, q1032
call m(iadst_8x8_internal_8bpc).main_pass2
@@ -1493,6 +1512,7 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, str
punpckhdq m3, m4
jmp tx2q
.pass2:
+ _CET_ENDBR
vpbroadcastd m4, [o(pw_4096)]
jmp m(iadst_8x8_internal_8bpc).end
@@ -1553,6 +1573,7 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride,
punpckhdq m3, m4 ; 3 7 11 15
jmp tx2q
.pass2:
+ _CET_ENDBR
vprord m5, [o(int16_perm)], 16
vshufi32x4 m2, m2, q1320 ; 2 10 14 6
vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11
@@ -1686,6 +1707,7 @@ cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride
punpckhqdq m3, m5
jmp tx2q
.pass2:
+ _CET_ENDBR
call .main_pass2
vpbroadcastd m6, [o(pw_2048)]
psrlq m10, 4
@@ -1794,6 +1816,7 @@ cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, st
pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3
jmp m(iadst_8x16_internal_8bpc).pass1_end
.pass2:
+ _CET_ENDBR
call m(iadst_8x16_internal_8bpc).main_pass2
vpbroadcastd m7, [o(pw_2048)]
psrlq m10, 36
@@ -1823,6 +1846,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, st
punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3
jmp tx2q
.pass2:
+ _CET_ENDBR
vpbroadcastd m7, [o(pw_1697x16)]
mova ym8, [o(gather8b)]
lea r3, [dstq+strideq*2]
@@ -1897,6 +1921,7 @@ cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride,
punpcklwd m0, m2
jmp tx2q
.pass2:
+ _CET_ENDBR
IDCT4_1D_PACKED
mova m2, [o(permA)]
jmp m(iadst_16x4_internal_8bpc).end
@@ -1936,6 +1961,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride
pmulhrsw m1, m6
jmp tx2q
.pass2:
+ _CET_ENDBR
call .main
movu m2, [o(permA+1)]
.end:
@@ -1986,6 +2012,7 @@ cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, st
psrlq m10, 16
jmp m(iadst_16x4_internal_8bpc).pass1_end
.pass2:
+ _CET_ENDBR
call m(iadst_16x4_internal_8bpc).main
movu m2, [o(permA+2)]
jmp m(iadst_16x4_internal_8bpc).end
@@ -2013,6 +2040,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, st
vpermb m1, m5, m1
jmp tx2q
.pass2:
+ _CET_ENDBR
vpbroadcastd m3, [o(pw_1697x8)]
pmulhrsw m2, m3, m0
pmulhrsw m3, m1
@@ -2112,6 +2140,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride,
punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3
jmp tx2q
.pass2:
+ _CET_ENDBR
vshufi32x4 m0, m2, m4, q2020 ; 0 1
vshufi32x4 m2, m4, q3131 ; 4 5
vshufi32x4 m1, m3, m5, q2020 ; 2 3
@@ -2211,6 +2240,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride
REPX {pmulhrsw x, m7}, m2, m3, m4, m5
jmp tx2q
.pass2:
+ _CET_ENDBR
vshufi32x4 m0, m2, m4, q2020
vshufi32x4 m2, m4, q3131 ; 4 5
vshufi32x4 m1, m3, m5, q2020
@@ -2265,6 +2295,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, st
psrlq m10, 20
jmp m(iadst_16x8_internal_8bpc).pass1_end
.pass2:
+ _CET_ENDBR
vshufi32x4 m0, m2, m4, q2020
vshufi32x4 m2, m4, q3131 ; 4 5
vshufi32x4 m1, m3, m5, q2020
@@ -2314,6 +2345,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, st
REPX {vpermb x, m9, x}, m2, m3, m4, m5
jmp tx2q
.pass2:
+ _CET_ENDBR
mova m7, [o(permB)]
vpbroadcastd m6, [o(pw_4096)]
vpermq m0, m7, m2
@@ -2373,6 +2405,7 @@ cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride
punpckldq m6, m11
jmp tx2q
.pass2:
+ _CET_ENDBR
vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc
vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4
vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec
@@ -2538,6 +2571,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, strid
REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7
jmp tx2q
.pass2:
+ _CET_ENDBR
call .main_pass2
mova m10, [o(permD)]
psrlq m8, m10, 8
@@ -2720,6 +2754,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, s
punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3
jmp m(iadst_16x16_internal_8bpc).pass1_end
.pass2:
+ _CET_ENDBR
call m(iadst_16x16_internal_8bpc).main_pass2
mova m10, [o(permD)]
psrlq m8, m10, 8
@@ -2789,6 +2824,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, s
jmp tx2q
ALIGN function_align
.pass2:
+ _CET_ENDBR
vpbroadcastd m11, [o(pw_1697x16)]
pmulhrsw m12, m11, m0
pmulhrsw m13, m11, m1
@@ -3131,6 +3167,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst,
call m(idct_8x16_internal_8bpc).main
call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast
.pass2:
+ _CET_ENDBR
vpbroadcastd m10, [o(pw_8192)]
vpermt2q m0, m15, m4 ; t0 t1 t9 t8
vpermt2q m20, m15, m18 ; t31 t30a t23a t22
@@ -3586,6 +3623,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst
punpckhwd m17, m17
call .main_oddhalf_fast
.pass2:
+ _CET_ENDBR
vpbroadcastd m10, [o(pw_2048)]
mova m11, [o(end_16x32p)]
lea r3, [strideq*3]
@@ -3798,6 +3836,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst
punpckhwd m17, m17 ; 15
call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast
.pass2:
+ _CET_ENDBR
vpbroadcastd m9, [o(pw_16384)]
call .transpose_round
vshufi32x4 m16, m14, m2, q3131 ; 5
@@ -5683,6 +5722,7 @@ ALIGN function_align
vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31
ret
.pass2:
+ _CET_ENDBR
vshufi32x4 m7, m5, m19, q3131 ; 14
vshufi32x4 m5, m19, q2020 ; 10
vshufi32x4 m21, m6, m20, q3131 ; 15