Index: src/x86/itx_avx512.asm --- src/x86/itx_avx512.asm.orig +++ src/x86/itx_avx512.asm @@ -475,6 +475,7 @@ cglobal idct_4x4_internal_8bpc, 0, 6, 0, dst, stride, pshufb m1, m3, m2 jmp tx2q .pass2: + _CET_ENDBR IDCT4_1D_PACKED pxor ymm16, ymm16 mova [cq], ymm16 @@ -495,6 +496,7 @@ cglobal iadst_4x4_internal_8bpc, 0, 6, 0, dst, stride, punpcklwd m0, m3 jmp tx2q .pass2: + _CET_ENDBR call .main .end: pxor ymm16, ymm16 @@ -521,6 +523,7 @@ cglobal iflipadst_4x4_internal_8bpc, 0, 6, 0, dst, str punpckhwd m1, m2 jmp tx2q .pass2: + _CET_ENDBR call m(iadst_4x4_internal_8bpc).main .end: pxor ymm16, ymm16 @@ -547,6 +550,7 @@ cglobal iidentity_4x4_internal_8bpc, 0, 6, 0, dst, str punpcklwd m0, m2 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 @@ -693,6 +697,7 @@ cglobal idct_4x8_internal_8bpc, 0, 6, 0, dst, stride, pshufb m1, m3, m2 jmp tx2q .pass2: + _CET_ENDBR vextracti32x4 xm2, m0, 1 vextracti32x4 xm3, m1, 1 call .main @@ -724,6 +729,7 @@ cglobal iadst_4x8_internal_8bpc, 0, 6, 0, dst, stride, punpcklwd m0, m3 jmp tx2q .pass2: + _CET_ENDBR vextracti32x4 xm2, m0, 1 vextracti32x4 xm3, m1, 1 pshufd xm4, xm0, q1032 @@ -787,6 +793,7 @@ cglobal iflipadst_4x8_internal_8bpc, 0, 6, 0, dst, str punpckhwd m1, m3 jmp tx2q .pass2: + _CET_ENDBR vextracti32x4 xm2, m0, 1 vextracti32x4 xm3, m1, 1 pshufd xm4, xm0, q1032 @@ -818,6 +825,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 6, 0, dst, str vextracti32x8 ym1, m0, 1 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd ym4, [o(pw_4096)] jmp m(iadst_4x8_internal_8bpc).end2 @@ -935,6 +943,7 @@ cglobal idct_4x16_internal_8bpc, 0, 6, 0, dst, stride, pmulhrsw m1, m4 jmp tx2q .pass2: + _CET_ENDBR vextracti32x4 xm2, ym0, 1 vextracti32x4 xm3, ym1, 1 vextracti32x4 xm4, m0, 2 @@ -975,6 +984,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 6, 0, dst, stride punpcklwd m0, m2 jmp tx2q .pass2: + _CET_ENDBR call .main vpbroadcastd m5, [o(pw_2048)] psrlq m10, 4 @@ -1082,6 +1092,7 @@ cglobal iflipadst_4x16_internal_8bpc, 0, 6, 0, dst, st punpckhwd m1, m2 jmp tx2q .pass2: + _CET_ENDBR call m(iadst_4x16_internal_8bpc).main vpbroadcastd m6, [o(pw_2048)] psrlq m10, 12 @@ -1109,6 +1120,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 6, 0, dst, st punpckhdq m1, m2 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m3, [o(pw_1697x16)] vpbroadcastd m5, [o(pw_2048)] pmulhrsw m2, m3, m0 @@ -1181,6 +1193,7 @@ cglobal idct_8x4_internal_8bpc, 0, 6, 0, dst, stride, pshufb m1, m4 jmp tx2q .pass2: + _CET_ENDBR IDCT4_1D_PACKED vpermq m0, m0, q3120 vpermq m1, m1, q2031 @@ -1210,6 +1223,7 @@ cglobal iadst_8x4_internal_8bpc, 0, 6, 0, dst, stride, punpcklwd m0, m3 jmp tx2q .pass2: + _CET_ENDBR call .main .end: vpermq m0, m0, q3120 @@ -1253,6 +1267,7 @@ cglobal iflipadst_8x4_internal_8bpc, 0, 6, 0, dst, str punpcklwd m0, m3 jmp tx2q .pass2: + _CET_ENDBR call m(iadst_8x4_internal_8bpc).main mova m2, m1 vpermq m1, m0, q2031 @@ -1280,6 +1295,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 6, 0, dst, str paddsw m1, m1 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 @@ -1349,6 +1365,7 @@ cglobal idct_8x8_internal_8bpc, 0, 6, 0, dst, stride, vshufi32x4 m3, m5, m3, 0x03 jmp tx2q .pass2: + _CET_ENDBR call .main vpbroadcastd m4, [o(pw_2048)] vpermq m0, m0, q3120 @@ -1388,6 +1405,7 @@ cglobal iadst_8x8_internal_8bpc, 0, 6, 0, dst, stride, vinserti32x4 m1, m4, xm1, 1 jmp tx2q .pass2: + _CET_ENDBR pshufd m4, m0, q1032 pshufd m5, m1, q1032 call .main_pass2 @@ -1455,6 +1473,7 @@ cglobal iflipadst_8x8_internal_8bpc, 0, 6, 0, dst, str vshufi32x4 m2, m4, m2, 0x03 jmp tx2q .pass2: + _CET_ENDBR pshufd m4, m0, q1032 pshufd m5, m1, q1032 call m(iadst_8x8_internal_8bpc).main_pass2 @@ -1493,6 +1512,7 @@ cglobal iidentity_8x8_internal_8bpc, 0, 6, 0, dst, str punpckhdq m3, m4 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m4, [o(pw_4096)] jmp m(iadst_8x8_internal_8bpc).end @@ -1553,6 +1573,7 @@ cglobal idct_8x16_internal_8bpc, 0, 6, 0, dst, stride, punpckhdq m3, m4 ; 3 7 11 15 jmp tx2q .pass2: + _CET_ENDBR vprord m5, [o(int16_perm)], 16 vshufi32x4 m2, m2, q1320 ; 2 10 14 6 vshufi32x4 m4, m1, m3, q2310 ; 1 5 15 11 @@ -1686,6 +1707,7 @@ cglobal iadst_8x16_internal_8bpc, 0, 6, 0, dst, stride punpckhqdq m3, m5 jmp tx2q .pass2: + _CET_ENDBR call .main_pass2 vpbroadcastd m6, [o(pw_2048)] psrlq m10, 4 @@ -1794,6 +1816,7 @@ cglobal iflipadst_8x16_internal_8bpc, 0, 6, 0, dst, st pshufb m2, m1, m6 ; e0 f0 e1 f1 e2 f2 e3 f3 jmp m(iadst_8x16_internal_8bpc).pass1_end .pass2: + _CET_ENDBR call m(iadst_8x16_internal_8bpc).main_pass2 vpbroadcastd m7, [o(pw_2048)] psrlq m10, 36 @@ -1823,6 +1846,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 6, 0, dst, st punpckhqdq m3, m4 ; a3 b3 c3 d3 e3 f3 g3 h3 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m7, [o(pw_1697x16)] mova ym8, [o(gather8b)] lea r3, [dstq+strideq*2] @@ -1897,6 +1921,7 @@ cglobal idct_16x4_internal_8bpc, 0, 6, 0, dst, stride, punpcklwd m0, m2 jmp tx2q .pass2: + _CET_ENDBR IDCT4_1D_PACKED mova m2, [o(permA)] jmp m(iadst_16x4_internal_8bpc).end @@ -1936,6 +1961,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 6, 0, dst, stride pmulhrsw m1, m6 jmp tx2q .pass2: + _CET_ENDBR call .main movu m2, [o(permA+1)] .end: @@ -1986,6 +2012,7 @@ cglobal iflipadst_16x4_internal_8bpc, 0, 6, 0, dst, st psrlq m10, 16 jmp m(iadst_16x4_internal_8bpc).pass1_end .pass2: + _CET_ENDBR call m(iadst_16x4_internal_8bpc).main movu m2, [o(permA+2)] jmp m(iadst_16x4_internal_8bpc).end @@ -2013,6 +2040,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 6, 0, dst, st vpermb m1, m5, m1 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m3, [o(pw_1697x8)] pmulhrsw m2, m3, m0 pmulhrsw m3, m1 @@ -2112,6 +2140,7 @@ cglobal idct_16x8_internal_8bpc, 0, 6, 0, dst, stride, punpckhdq m5, m6, m7 ; i2 j2 k2 l2 i3 j3 k3 l3 jmp tx2q .pass2: + _CET_ENDBR vshufi32x4 m0, m2, m4, q2020 ; 0 1 vshufi32x4 m2, m4, q3131 ; 4 5 vshufi32x4 m1, m3, m5, q2020 ; 2 3 @@ -2211,6 +2240,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 6, 0, dst, stride REPX {pmulhrsw x, m7}, m2, m3, m4, m5 jmp tx2q .pass2: + _CET_ENDBR vshufi32x4 m0, m2, m4, q2020 vshufi32x4 m2, m4, q3131 ; 4 5 vshufi32x4 m1, m3, m5, q2020 @@ -2265,6 +2295,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 6, 0, dst, st psrlq m10, 20 jmp m(iadst_16x8_internal_8bpc).pass1_end .pass2: + _CET_ENDBR vshufi32x4 m0, m2, m4, q2020 vshufi32x4 m2, m4, q3131 ; 4 5 vshufi32x4 m1, m3, m5, q2020 @@ -2314,6 +2345,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 6, 0, dst, st REPX {vpermb x, m9, x}, m2, m3, m4, m5 jmp tx2q .pass2: + _CET_ENDBR mova m7, [o(permB)] vpbroadcastd m6, [o(pw_4096)] vpermq m0, m7, m2 @@ -2373,6 +2405,7 @@ cglobal idct_16x16_internal_8bpc, 0, 6, 0, dst, stride punpckldq m6, m11 jmp tx2q .pass2: + _CET_ENDBR vshufi32x4 m8, m4, m6, q3232 ; i8 ic m8 mc vinserti32x8 m4, ym6, 1 ; i0 i4 m0 m4 vshufi32x4 m6, m0, m2, q3232 ; a8 ac e8 ec @@ -2538,6 +2571,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 6, 0, dst, strid REPX {pmulhrsw x, m10}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: + _CET_ENDBR call .main_pass2 mova m10, [o(permD)] psrlq m8, m10, 8 @@ -2720,6 +2754,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 6, 0, dst, s punpckhwd m5, m8, m9 ; i2 j2 k2 l2 i3 j3 k3 l3 jmp m(iadst_16x16_internal_8bpc).pass1_end .pass2: + _CET_ENDBR call m(iadst_16x16_internal_8bpc).main_pass2 mova m10, [o(permD)] psrlq m8, m10, 8 @@ -2789,6 +2824,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 6, 0, dst, s jmp tx2q ALIGN function_align .pass2: + _CET_ENDBR vpbroadcastd m11, [o(pw_1697x16)] pmulhrsw m12, m11, m0 pmulhrsw m13, m11, m1 @@ -3131,6 +3167,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst, call m(idct_8x16_internal_8bpc).main call m(inv_txfm_add_dct_dct_8x32_8bpc).main_fast .pass2: + _CET_ENDBR vpbroadcastd m10, [o(pw_8192)] vpermt2q m0, m15, m4 ; t0 t1 t9 t8 vpermt2q m20, m15, m18 ; t31 t30a t23a t22 @@ -3586,6 +3623,7 @@ cglobal inv_txfm_add_dct_dct_16x32_8bpc, 4, 4, 22, dst punpckhwd m17, m17 call .main_oddhalf_fast .pass2: + _CET_ENDBR vpbroadcastd m10, [o(pw_2048)] mova m11, [o(end_16x32p)] lea r3, [strideq*3] @@ -3798,6 +3836,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 6, 22, dst punpckhwd m17, m17 ; 15 call m(inv_txfm_add_dct_dct_16x32_8bpc).main_oddhalf_fast .pass2: + _CET_ENDBR vpbroadcastd m9, [o(pw_16384)] call .transpose_round vshufi32x4 m16, m14, m2, q3131 ; 5 @@ -5683,6 +5722,7 @@ ALIGN function_align vinserti32x8 m17, ym21, 1 ; c30 c31 d30 d31 ret .pass2: + _CET_ENDBR vshufi32x4 m7, m5, m19, q3131 ; 14 vshufi32x4 m5, m19, q2020 ; 10 vshufi32x4 m21, m6, m20, q3131 ; 15