Index: src/x86/itx16_avx2.asm --- src/x86/itx16_avx2.asm.orig +++ src/x86/itx16_avx2.asm @@ -360,6 +360,7 @@ cglobal idct_4x4_internal_10bpc, 0, 7, 6, dst, stride, pshufb m0, m2 jmp tx2q .pass2: + _CET_ENDBR vextracti128 xm1, m0, 1 WRAP_XMM IDCT4_1D_PACKED_WORD 0, 1, 2, 3, 4, 5 packssdw xm5, xm5 ; pw_2048 @@ -445,6 +446,7 @@ cglobal iadst_4x4_internal_10bpc, 0, 7, 6, dst, stride %endif jmp tx2q .pass2: + _CET_ENDBR lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 call m(iadst_4x4_internal_8bpc).main @@ -497,6 +499,7 @@ cglobal iflipadst_4x4_internal_10bpc, 0, 7, 6, dst, st vinserti128 m1, m6, xm4, 1 jmp m(iadst_4x4_internal_10bpc).pass1_end .pass2: + _CET_ENDBR lea r6, [deint_shuf+128] vextracti128 xm1, m0, 1 call m(iadst_4x4_internal_8bpc).main @@ -545,6 +548,7 @@ cglobal iidentity_4x4_internal_10bpc, 0, 7, 6, dst, st pshufb m0, m3 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m1, [pw_1697x8] movq xm2, [dstq+strideq*0] movhps xm2, [dstq+strideq*1] @@ -585,6 +589,7 @@ cglobal idct_4x4_internal_12bpc, 0, 7, 8, dst, stride, vpermd m1, m3, m0 jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: + _CET_ENDBR vpbroadcastd m5, [pd_2048] vpermq m0, m0, q3120 vpermq m1, m1, q3120 @@ -624,6 +629,7 @@ cglobal iadst_4x4_internal_12bpc, 0, 7, 8, dst, stride pminsd m1, m4 jmp tx2q .pass2: + _CET_ENDBR call .main_pass2 vinserti128 m0, m4, xm6, 1 vinserti128 m1, m2, xm3, 1 @@ -680,6 +686,7 @@ cglobal iflipadst_4x4_internal_12bpc, 0, 7, 8, dst, st vinserti128 m2, m6, xm4, 1 jmp m(iadst_4x4_internal_12bpc).pass1_end .pass2: + _CET_ENDBR call m(iadst_4x4_internal_12bpc).main_pass2 vinserti128 m0, m3, xm2, 1 vinserti128 m1, m6, xm4, 1 @@ -706,6 +713,7 @@ cglobal iidentity_4x4_internal_12bpc, 0, 7, 8, dst, st paddd m2, m3 jmp m(iadst_4x4_internal_12bpc).pass1_end2 .pass2: + _CET_ENDBR ; m0 = in0 in1 ; m1 = in2 in3 vpbroadcastd m3, [pd_5793] @@ -771,6 +779,7 @@ cglobal idct_4x8_internal_10bpc, 0, 7, 8, dst, stride, IDCT4_1D 0, 1, 2, 3, 4, 5, 6, 7 jmp tx2q .pass2: + _CET_ENDBR packssdw m0, m2 packssdw m1, m3 lea r6, [deint_shuf+128] @@ -828,6 +837,7 @@ cglobal iadst_4x8_internal_10bpc, 0, 7, 8, dst, stride REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: + _CET_ENDBR call .pass2_main mova xm4, [pw_2048_m2048] REPX {pmulhrsw x, xm4}, xm0, xm1, xm2, xm3 @@ -949,6 +959,7 @@ cglobal iflipadst_4x8_internal_10bpc, 0, 7, 8, dst, st paddd m3, m5, m4 jmp m(iadst_4x8_internal_10bpc).pass1_end .pass2: + _CET_ENDBR call m(iadst_4x8_internal_10bpc).pass2_main mova xm4, [pw_2048_m2048] REPX {pmulhrsw x, xm4}, xm3, xm2, xm1, xm0 @@ -1002,6 +1013,7 @@ cglobal iidentity_4x8_internal_10bpc, 0, 7, 8, dst, st REPX {psrad x, 12}, m0, m1, m2, m3 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m6, [pixel_10bpc_max] call .pass2_end RET @@ -1058,6 +1070,7 @@ INV_TXFM_4X8_FN dct, flipadst, 12 cglobal idct_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 jmp m(idct_4x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 @@ -1104,6 +1117,7 @@ cglobal iadst_4x8_internal_12bpc, 0, 7, 10, dst, strid REPX {psrad x, 11}, m0, m1, m2, m3 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 @@ -1185,6 +1199,7 @@ cglobal iflipadst_4x8_internal_12bpc, 0, 7, 10, dst, s psrad m3, m4, 1 jmp m(iadst_4x8_internal_12bpc).pass1_end .pass2: + _CET_ENDBR vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 @@ -1206,6 +1221,7 @@ INV_TXFM_4X8_FN identity, identity, 12 cglobal iidentity_4x8_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 jmp m(iidentity_4x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR ; m0 = in0 in1 ; m1 = in2 in3 ; m2 = in4 in5 @@ -1248,6 +1264,7 @@ cglobal idct_4x16_internal_10bpc, 0, 7, 11, dst, strid REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: + _CET_ENDBR packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 @@ -1355,6 +1372,7 @@ cglobal iadst_4x16_internal_10bpc, 0, 7, 11, dst, stri psrad m7, 13 jmp tx2q .pass2: + _CET_ENDBR call .pass2_main vpbroadcastd m5, [pw_2048] vpbroadcastd m8, [pixel_10bpc_max] @@ -1535,6 +1553,7 @@ cglobal iflipadst_4x16_internal_10bpc, 0, 7, 11, dst, psrad m7, m8, 13 jmp tx2q .pass2: + _CET_ENDBR call m(iadst_4x16_internal_10bpc).pass2_main vpbroadcastd m5, [pw_2048] vpbroadcastd m8, [pixel_10bpc_max] @@ -1596,6 +1615,7 @@ cglobal iidentity_4x16_internal_10bpc, 0, 7, 11, dst, REPX {psrad x, 13}, m0, m4, m1, m5, m2, m6, m3, m7 jmp tx2q .pass2: + _CET_ENDBR packssdw m0, m4 packssdw m1, m5 packssdw m2, m6 @@ -1666,6 +1686,7 @@ INV_TXFM_4X16_FN dct, flipadst, 12 cglobal idct_4x16_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp m(idct_4x16_internal_10bpc).pass1 .pass2: + _CET_ENDBR punpckldq m8, m0, m1 punpckhdq m0, m1 punpckldq m9, m2, m3 @@ -1725,6 +1746,7 @@ cglobal iadst_4x16_internal_12bpc, 0, 7, 14, dst, stri psrad m7, 12 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 @@ -1819,6 +1841,7 @@ cglobal iflipadst_4x16_internal_12bpc, 0, 7, 14, dst, psrad m7, m8, 12 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 @@ -1876,6 +1899,7 @@ cglobal iidentity_4x16_internal_12bpc, 0, 7, 14, dst, REPX {psrad x, 1 }, m2, m6, m3, m7 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 @@ -1950,6 +1974,7 @@ cglobal idct_8x4_internal_10bpc, 0, 7, 10, dst, stride pshufd m3, m3, q1032 jmp tx2q .pass2: + _CET_ENDBR vbroadcasti128 m4, [deint_shuf] packssdw m0, m1 packssdw m2, m3 @@ -1997,6 +2022,7 @@ cglobal iadst_8x4_internal_10bpc, 0, 7, 10, dst, strid psignd m1, m6 ; out2 out3 jmp tx2q .pass2: + _CET_ENDBR call .pass2_main vpermq m0, m0, q3120 ; out0 out1 vpermq m2, m1, q3120 ; out2 out3 @@ -2064,6 +2090,7 @@ cglobal iflipadst_8x4_internal_10bpc, 0, 5, 10, dst, s psignd m2, m5, m6 jmp tx2q .pass2: + _CET_ENDBR call m(iadst_8x4_internal_10bpc).pass2_main vpermq m2, m0, q2031 vpermq m0, m1, q2031 @@ -2088,6 +2115,7 @@ cglobal iidentity_8x4_internal_10bpc, 0, 7, 10, dst, s REPX {paddd x, x }, m0, m1, m2, m3 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m5, [pixel_10bpc_max] vpbroadcastd m4, [pw_1697x8] packssdw m0, m1 @@ -2135,6 +2163,7 @@ cglobal idct_8x4_internal_12bpc, 0, 7, 10, dst, stride vpbroadcastd m9, [clip_20b_max] jmp m(idct_8x4_internal_10bpc).pass1 .pass2: + _CET_ENDBR vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 @@ -2159,6 +2188,7 @@ cglobal iadst_8x4_internal_12bpc, 0, 7, 10, dst, strid psignd m1, m6 ; out2 out3 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 @@ -2221,6 +2251,7 @@ cglobal iflipadst_8x4_internal_12bpc, 0, 5, 10, dst, s psignd m2, m5, m6 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m8, [clip_18b_min] vpbroadcastd m9, [clip_18b_max] REPX {pmaxsd x, m8}, m0, m1, m2, m3 @@ -2241,6 +2272,7 @@ INV_TXFM_8X4_FN identity, identity, 12 cglobal iidentity_8x4_internal_12bpc, 0, 7, 10, dst, stride, c, eob, tx2 jmp m(iidentity_8x4_internal_10bpc).pass1 .pass2: + _CET_ENDBR ; m0 = in0 in1 (interleaved) ; m1 = in2 in3 (interleaved) ; m2 = in4 in5 (interleaved) @@ -2352,6 +2384,7 @@ cglobal idct_8x8_internal_10bpc, 0, 7, 14, dst, stride call .round_shift1 jmp tx2q .pass2: + _CET_ENDBR call .transpose_8x8_packed call m(idct_8x8_internal_8bpc).main vpbroadcastd m12, [pw_2048] @@ -2471,6 +2504,7 @@ cglobal iadst_8x8_internal_10bpc, 0, 7, 14, dst, strid call .main_end jmp tx2q .pass2: + _CET_ENDBR call m(idct_8x8_internal_10bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 @@ -2532,6 +2566,7 @@ cglobal iflipadst_8x8_internal_10bpc, 0, 7, 14, dst, s call .main_end jmp tx2q .pass2: + _CET_ENDBR call m(idct_8x8_internal_10bpc).transpose_8x8_packed pshufd m4, m0, q1032 pshufd m5, m1, q1032 @@ -2588,6 +2623,7 @@ cglobal iidentity_8x8_internal_10bpc, 0, 7, 14, dst, s mova m7, [cq+32*7] jmp tx2q .pass2: + _CET_ENDBR packssdw m3, m7 vpbroadcastd m7, [pixel_10bpc_max] .pass2_main: @@ -2678,6 +2714,7 @@ cglobal idct_8x8_internal_12bpc, 0, 7, 14, dst, stride vpbroadcastd m13, [clip_20b_max] jmp m(idct_8x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 @@ -2722,6 +2759,7 @@ cglobal iadst_8x8_internal_12bpc, 0, 7, 14, dst, strid vpbroadcastd m13, [clip_20b_max] jmp m(iadst_8x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR call .pass2_main .pass2_end: packssdw m0, m1 @@ -2769,6 +2807,7 @@ cglobal iflipadst_8x8_internal_12bpc, 0, 7, 14, dst, s vpbroadcastd m13, [clip_20b_max] jmp m(iflipadst_8x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR call m(iadst_8x8_internal_12bpc).pass2_main packssdw m7, m7, m6 packssdw m6, m1, m0 @@ -2791,6 +2830,7 @@ INV_TXFM_8X8_FN identity, identity, 12 cglobal iidentity_8x8_internal_12bpc, 0, 7, 14, dst, stride, c, eob, tx2 jmp m(iidentity_8x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR packssdw m3, m7 vpbroadcastd m7, [pixel_12bpc_max] jmp m(iidentity_8x8_internal_10bpc).pass2_main @@ -2849,6 +2889,7 @@ cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, strid REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: + _CET_ENDBR call .transpose call m(idct_8x16_internal_8bpc).main vpbroadcastd m12, [pw_2048] @@ -3046,6 +3087,7 @@ cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stri REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: + _CET_ENDBR call m(idct_8x16_internal_10bpc).transpose call m(iadst_8x16_internal_8bpc).main call m(iadst_8x16_internal_8bpc).main_pass2_end @@ -3112,6 +3154,7 @@ cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst, REPX {mova x, m8}, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: + _CET_ENDBR call m(idct_8x16_internal_10bpc).transpose call m(iadst_8x16_internal_8bpc).main call m(iadst_8x16_internal_8bpc).main_pass2_end @@ -3179,6 +3222,7 @@ cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst, m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: + _CET_ENDBR packssdw m0, m8 packssdw m1, m9 packssdw m2, m10 @@ -3244,6 +3288,7 @@ cglobal idct_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, vpbroadcastd m13, [clip_20b_max] jmp m(idct_8x16_internal_10bpc).pass1 .pass2: + _CET_ENDBR lea r6, [rsp+32*4] call .transpose vpbroadcastd m12, [clip_18b_min] @@ -3338,6 +3383,7 @@ cglobal iadst_8x16_internal_12bpc, 0, 7, 16, 32*8, dst vpbroadcastd m13, [clip_20b_max] jmp m(iadst_8x16_internal_10bpc).pass1 .pass2: + _CET_ENDBR lea r6, [rsp+32*4] call .pass2_main call m(iadst_16x8_internal_10bpc).pass1_rotations @@ -3392,6 +3438,7 @@ cglobal iflipadst_8x16_internal_12bpc, 0, 7, 16, 32*8, vpbroadcastd m13, [clip_20b_max] jmp m(iflipadst_8x16_internal_10bpc).pass1 .pass2: + _CET_ENDBR lea r6, [rsp+32*4] call m(iadst_8x16_internal_12bpc).pass2_main call m(iflipadst_16x8_internal_10bpc).pass1_rotations @@ -3405,6 +3452,7 @@ INV_TXFM_8X16_FN identity, identity, 0, 12 cglobal iidentity_8x16_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iidentity_8x16_internal_10bpc).pass1 .pass2: + _CET_ENDBR call .pass2_main packssdw m0, m8 packssdw m1, m9 @@ -3521,6 +3569,7 @@ cglobal idct_16x4_internal_10bpc, 0, 7, 14, dst, strid REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: + _CET_ENDBR call .transpose_4x16_packed lea r6, [deint_shuf+128] call m(idct_16x4_internal_8bpc).main @@ -3653,6 +3702,7 @@ cglobal iadst_16x4_internal_10bpc, 0, 7, 14, dst, stri REPX {psrad x, 1}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: + _CET_ENDBR call m(idct_16x4_internal_10bpc).transpose_4x16_packed lea r6, [deint_shuf+128] call m(iadst_16x4_internal_8bpc).main @@ -3738,6 +3788,7 @@ cglobal iflipadst_16x4_internal_10bpc, 0, 7, 14, dst, paddd m0, m8, m11 jmp m(iadst_16x4_internal_10bpc).pass1_end .pass2: + _CET_ENDBR call m(idct_16x4_internal_10bpc).transpose_4x16_packed lea r6, [deint_shuf+128] call m(iadst_16x4_internal_8bpc).main @@ -3772,6 +3823,7 @@ cglobal iidentity_16x4_internal_10bpc, 0, 7, 14, dst, REPX {psrad x, 12}, m0, m1, m2, m3, m4, m5, m6, m7 jmp tx2q .pass2: + _CET_ENDBR call m(idct_16x4_internal_10bpc).transpose_4x16_packed vpbroadcastd m7, [pw_1697x8] pmulhrsw m4, m7, m0 @@ -3794,6 +3846,7 @@ cglobal idct_16x4_internal_12bpc, 0, 7, 14, dst, strid vpbroadcastd m9, [clip_20b_max] jmp m(idct_16x4_internal_10bpc).pass1 .pass2: + _CET_ENDBR vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 @@ -3843,6 +3896,7 @@ cglobal iadst_16x4_internal_12bpc, 0, 7, 14, dst, stri vpbroadcastd m13, [clip_20b_max] jmp m(iadst_16x4_internal_10bpc).pass1 .pass2: + _CET_ENDBR call .pass2_main REPX {vpermq x, x, q3120}, m0, m1, m2, m3 REPX {pmulhrsw x, m4}, m0, m1, m2, m3 @@ -3898,6 +3952,7 @@ cglobal iflipadst_16x4_internal_12bpc, 0, 7, 14, dst, vpbroadcastd m13, [clip_20b_max] jmp m(iflipadst_16x4_internal_10bpc).pass1 .pass2: + _CET_ENDBR call m(iadst_16x4_internal_12bpc).pass2_main vpermq m7, m0, q3120 vpermq m6, m1, q3120 @@ -3947,6 +4002,7 @@ cglobal iidentity_16x4_internal_12bpc, 0, 7, 14, dst, paddd m7, m13 jmp tx2q .pass2: + _CET_ENDBR vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] REPX {pmaxsd x, m12}, m0, m1, m2, m3, m4, m5, m6, m7 @@ -4014,6 +4070,7 @@ cglobal idct_16x8_internal_10bpc, 0, 7, 16, 32*8, dst, m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: + _CET_ENDBR call .transpose call m(idct_16x8_internal_8bpc).main vpbroadcastd m10, [pw_2048] @@ -4144,6 +4201,7 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, 32*8, dst REPX {psrad x, 12}, m4, m5, m6, m7, m8, m9, m10, m11 jmp tx2q .pass2: + _CET_ENDBR call m(idct_16x8_internal_10bpc).transpose call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end @@ -4356,6 +4414,7 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, 32*8, call .pass1_rotations jmp m(iadst_16x8_internal_10bpc).pass1_end .pass2: + _CET_ENDBR call m(idct_16x8_internal_10bpc).transpose call m(iadst_16x8_internal_8bpc).main call m(iadst_16x8_internal_8bpc).main_pass2_end @@ -4443,6 +4502,7 @@ cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, 32*8, m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: + _CET_ENDBR call m(idct_16x8_internal_10bpc).transpose vpbroadcastd m10, [pw_4096] jmp m(idct_16x8_internal_10bpc).end @@ -4457,6 +4517,7 @@ cglobal idct_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, vpbroadcastd m13, [clip_20b_max] jmp m(idct_16x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR call .pass2_main RET ALIGN function_align @@ -4522,6 +4583,7 @@ cglobal iadst_16x8_internal_12bpc, 0, 7, 16, 32*8, dst vpbroadcastd m14, [clip_20b_max] jmp m(iadst_16x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR call .pass2_main call m(idct_16x8_internal_12bpc).end RET @@ -4564,6 +4626,7 @@ cglobal iflipadst_16x8_internal_12bpc, 0, 7, 16, 32*8, vpbroadcastd m14, [clip_20b_max] jmp m(iflipadst_16x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR call m(iadst_16x8_internal_12bpc).pass2_main packssdw m13, m0, [cq+32* 8] packssdw m12, m1, [cq+32* 9] @@ -4591,6 +4654,7 @@ INV_TXFM_16X8_FN identity, identity, 12 cglobal iidentity_16x8_internal_12bpc, 0, 7, 16, 32*8, dst, stride, c, eob, tx2 jmp m(iidentity_16x8_internal_10bpc).pass1 .pass2: + _CET_ENDBR call m(idct_16x8_internal_10bpc).transpose2 vpbroadcastd m10, [pw_4096] pmulhrsw m0, m10 @@ -4708,6 +4772,7 @@ cglobal idct_16x16_internal_10bpc, 0, 7, 16, 32*24, ds m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: + _CET_ENDBR call .transpose lea r6, [pw_5+128] mova [rsp], m15 @@ -4944,6 +5009,7 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, 32*24, d sub r6, 32*8 jmp tx2q .pass2: + _CET_ENDBR call m(idct_16x16_internal_10bpc).transpose lea r6, [pw_5+128] mova [rsp], m15 @@ -5077,6 +5143,7 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, 32*2 paddd m1, [r6-32*3] jmp m(iadst_16x16_internal_10bpc).pass1_end .pass2: + _CET_ENDBR call m(idct_16x16_internal_10bpc).transpose lea r6, [pw_5+128] mova [rsp], m15 @@ -5163,6 +5230,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, 32*2 m8, m9, m10, m11, m12, m13, m14, m15 jmp tx2q .pass2: + _CET_ENDBR call m(idct_16x16_internal_10bpc).transpose mova [cq+32*0], m15 @@ -5191,6 +5259,7 @@ cglobal idct_16x16_internal_12bpc, 0, 7, 16, 32*24, ds vpbroadcastd m13, [clip_20b_max] jmp m(idct_16x16_internal_10bpc).pass1 .pass2: + _CET_ENDBR mova [cq+32* 8], m8 mova [cq+32* 9], m9 mova [cq+32*10], m10 @@ -5318,6 +5387,7 @@ cglobal iadst_16x16_internal_12bpc, 0, 7, 16, 32*24, d vpbroadcastd m14, [clip_20b_max] jmp m(iadst_16x16_internal_10bpc).pass1 .pass2: + _CET_ENDBR call .pass2_part1 call m(iadst_16x8_internal_10bpc).pass1_rotations call .pass2_part2 @@ -5487,6 +5557,7 @@ cglobal iflipadst_16x16_internal_12bpc, 0, 7, 16, 32*2 vpbroadcastd m14, [clip_20b_max] jmp m(iflipadst_16x16_internal_10bpc).pass1 .pass2: + _CET_ENDBR call m(iadst_16x16_internal_12bpc).pass2_part1 call m(iflipadst_16x8_internal_10bpc).pass1_rotations call m(iadst_16x16_internal_12bpc).pass2_part2 @@ -5564,6 +5635,7 @@ cglobal iidentity_16x16_internal_12bpc, 0, 7, 16, 32*2 mova m7, [cq+64*1] jmp tx2q .pass2: + _CET_ENDBR call m(iidentity_8x16_internal_12bpc).pass2_main call m(idct_16x16_internal_10bpc).transpose_fast test eobd, eobd @@ -6200,6 +6272,7 @@ cglobal inv_txfm_add_dct_dct_8x32_12bpc, 4, 7, 0, dst, .eob171: call .pass1_main .pass2: + _CET_ENDBR mov cq, r4 vpbroadcastd m12, [clip_18b_min] vpbroadcastd m13, [clip_18b_max] @@ -6428,6 +6501,7 @@ cglobal inv_txfm_add_dct_dct_32x8_10bpc, 4, 7, 0, dst, RET ALIGN function_align .pass2: + _CET_ENDBR call m(idct_16x8_internal_8bpc).main REPX {pmulhrsw x, m11}, m0, m1, m2, m3 call m(idct_16x8_internal_10bpc).write_16x4_start @@ -7240,6 +7314,7 @@ cglobal inv_txfm_add_dct_dct_32x32_10bpc, 4, 7, 0, dst cmp r6, r4 jl .fast_loop .pass2: + _CET_ENDBR lea r3, [rsp+32*3] mov r4, r6 lea r5, [r6+32*8] @@ -7525,6 +7600,7 @@ cglobal inv_txfm_add_dct_dct_16x64_10bpc, 4, 7, 0, dst cmp r6, r4 jl .fast_loop .pass2: + _CET_ENDBR lea r6, [pw_5+128] mova m0, [rsp+32* 2] ; in0 mova m1, [rsp+32* 6] ; in4 @@ -7868,6 +7944,7 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst cmp r6, r4 jl .fast_loop .pass2: + _CET_ENDBR lea r6, [pw_5 + 128] mov r10, rsp lea r8, [strideq*4] @@ -8084,6 +8161,7 @@ cglobal inv_txfm_add_dct_dct_64x16_10bpc, 4, 7, 0, dst dec r3d jg .fast_loop .pass2: + _CET_ENDBR lea r7, [r6-32*64] lea r4, [r6-32*32] lea r6, [pw_5+128] @@ -8317,6 +8395,7 @@ cglobal inv_txfm_add_dct_dct_64x32_10bpc, 4, 7, 0, dst cmp r6, r4 jl .fast_loop .pass2: + _CET_ENDBR lea r7, [r6-32*32] lea r5, [r6+32*8] lea r6, [pw_5+128] @@ -8461,6 +8540,7 @@ cglobal inv_txfm_add_dct_dct_64x64_10bpc, 4, 7, 0, dst cmp r6, r4 jl .fast_loop .pass2: + _CET_ENDBR lea r10, [r6-32*32] lea r6, [pw_5+128] lea r8, [strideq*4]