Index: src/x86/mc_avx2.asm
--- src/x86/mc_avx2.asm.orig
+++ src/x86/mc_avx2.asm
@@ -212,10 +212,12 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     test mxyd, mxyd
     jnz .v
 .put:
+    _CET_ENDBR
     movzx wd, word [r7+wq*2+table_offset(put,)]
     add wq, r7
     jmp wq
 .put_w2:
+    _CET_ENDBR
     movzx r6d, word [srcq+ssq*0]
     movzx r7d, word [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -226,6 +228,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     jg .put_w2
     RET
 .put_w4:
+    _CET_ENDBR
     mov r6d, [srcq+ssq*0]
     mov r7d, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -236,6 +239,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     jg .put_w4
     RET
 .put_w8:
+    _CET_ENDBR
     mov r6, [srcq+ssq*0]
     mov r7, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -246,6 +250,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     jg .put_w8
     RET
 .put_w16:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0]
     movu m1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -257,6 +262,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     RET
 INIT_YMM avx2
 .put_w32:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0]
     movu m1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -267,6 +273,7 @@ INIT_YMM avx2
     jg .put_w32
     RET
 .put_w64:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0+32*0]
     movu m1, [srcq+ssq*0+32*1]
     movu m2, [srcq+ssq*1+32*0]
@@ -281,6 +288,7 @@ INIT_YMM avx2
     jg .put_w64
     RET
 .put_w128:
+    _CET_ENDBR
     movu m0, [srcq+32*0]
     movu m1, [srcq+32*1]
     movu m2, [srcq+32*2]
@@ -295,6 +303,7 @@ INIT_YMM avx2
     jg .put_w128
     RET
 .h:
+    _CET_ENDBR
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
     imul mxyd, 255
@@ -310,6 +319,7 @@ INIT_YMM avx2
     add wq, r7
     jmp wq
 .h_w2:
+    _CET_ENDBR
     movd xm0, [srcq+ssq*0]
     pinsrd xm0, [srcq+ssq*1], 1
     lea srcq, [srcq+ssq*2]
@@ -324,6 +334,7 @@ INIT_YMM avx2
     jg .h_w2
     RET
 .h_w4:
+    _CET_ENDBR
     mova xm4, [bilin_h_shuf4]
 .h_w4_loop:
     movq xm0, [srcq+ssq*0]
@@ -340,6 +351,7 @@ INIT_YMM avx2
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     movu xm0, [srcq+ssq*0]
     movu xm1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -357,6 +369,7 @@ INIT_YMM avx2
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu xm0, [srcq+ssq*0+8*0]
     vinserti128 m0, [srcq+ssq*1+8*0], 1
     movu xm1, [srcq+ssq*0+8*1]
@@ -376,6 +389,7 @@ INIT_YMM avx2
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     movu m0, [srcq+8*0]
     movu m1, [srcq+8*1]
     add srcq, ssq
@@ -392,6 +406,7 @@ INIT_YMM avx2
     jg .h_w32
     RET
 .h_w64:
+    _CET_ENDBR
     movu m0, [srcq+8*0]
     movu m1, [srcq+8*1]
     pshufb m0, m4
@@ -418,6 +433,7 @@ INIT_YMM avx2
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     mov r6, -32*3
 .h_w128_loop:
     movu m0, [srcq+r6+32*3+8*0]
@@ -438,6 +454,7 @@ INIT_YMM avx2
     jg .h_w128
     RET
 .v:
+    _CET_ENDBR
     movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
     imul mxyd, 255
     vpbroadcastd m5, [pw_2048]
@@ -447,6 +464,7 @@ INIT_YMM avx2
     vpbroadcastw m4, xm4
     jmp wq
 .v_w2:
+    _CET_ENDBR
     movd xm0, [srcq+ssq*0]
 .v_w2_loop:
     pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1
@@ -464,6 +482,7 @@ INIT_YMM avx2
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd xm0, [srcq+ssq*0]
 .v_w4_loop:
     vpbroadcastd xm2, [srcq+ssq*1]
@@ -482,6 +501,7 @@ INIT_YMM avx2
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq xm0, [srcq+ssq*0]
 .v_w8_loop:
     movq xm2, [srcq+ssq*1]
@@ -501,6 +521,7 @@ INIT_YMM avx2
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     movu xm0, [srcq+ssq*0]
 .v_w16_loop:
     vbroadcasti128 m3, [srcq+ssq*1]
@@ -522,6 +543,7 @@ INIT_YMM avx2
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
 %macro PUT_BILIN_V_W32 0
     movu m0, [srcq+ssq*0]
 %%loop:
@@ -551,6 +573,7 @@ INIT_YMM avx2
     PUT_BILIN_V_W32
     RET
 .v_w64:
+    _CET_ENDBR
     movu m0, [srcq+32*0]
     movu m1, [srcq+32*1]
 .v_w64_loop:
@@ -580,6 +603,7 @@ INIT_YMM avx2
     jg .v_w64_loop
     RET
 .v_w128:
+    _CET_ENDBR
     lea r6d, [hq+(3<<8)]
     mov r4, srcq
     mov r7, dstq
@@ -594,6 +618,7 @@ INIT_YMM avx2
     jg .v_w128_loop
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
     movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
@@ -606,6 +631,7 @@ INIT_YMM avx2
     vpbroadcastw m6, xm6
     jmp wq
 .hv_w2:
+    _CET_ENDBR
     vpbroadcastd xm0, [srcq+ssq*0]
     pshufb xm0, xm4
     pmaddubsw xm0, xm5
@@ -630,6 +656,7 @@ INIT_YMM avx2
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova xm4, [bilin_h_shuf4]
     movddup xm0, [srcq+ssq*0]
     pshufb xm0, xm4
@@ -655,6 +682,7 @@ INIT_YMM avx2
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     vbroadcasti128 m0, [srcq+ssq*0]
     pshufb m0, m4
     pmaddubsw m0, m5
@@ -680,6 +708,7 @@ INIT_YMM avx2
     jg .hv_w8_loop
     RET
 .hv_w16:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0+8*0]
     vinserti128 m0, [srcq+ssq*0+8*1], 1
     pshufb m0, m4
@@ -713,14 +742,17 @@ INIT_YMM avx2
     jg .hv_w16_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea r6d, [hq+(3<<16)]
     jmp .hv_w32_start
 .hv_w64:
+    _CET_ENDBR
     lea r6d, [hq+(1<<16)]
 .hv_w32_start:
     mov r4, srcq
     mov r7, dstq
 .hv_w32:
+    _CET_ENDBR
 %if WIN64
     movaps r4m, xmm8
 %endif
@@ -784,6 +816,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea stride3q, [strideq*3]
     jmp wq
 .prep_w4:
+    _CET_ENDBR
     movd xm0, [srcq+strideq*0]
     pinsrd xm0, [srcq+strideq*1], 1
     pinsrd xm0, [srcq+strideq*2], 2
@@ -797,6 +830,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w4
     RET
 .prep_w8:
+    _CET_ENDBR
     movq xm0, [srcq+strideq*0]
     movhps xm0, [srcq+strideq*1]
     movq xm1, [srcq+strideq*2]
@@ -813,6 +847,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w8
     RET
 .prep_w16:
+    _CET_ENDBR
     pmovzxbw m0, [srcq+strideq*0]
     pmovzxbw m1, [srcq+strideq*1]
     pmovzxbw m2, [srcq+strideq*2]
@@ -831,6 +866,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w16
     RET
 .prep_w32:
+    _CET_ENDBR
     pmovzxbw m0, [srcq+strideq*0+16*0]
     pmovzxbw m1, [srcq+strideq*0+16*1]
     pmovzxbw m2, [srcq+strideq*1+16*0]
@@ -849,6 +885,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w32
     RET
 .prep_w64:
+    _CET_ENDBR
     pmovzxbw m0, [srcq+16*0]
     pmovzxbw m1, [srcq+16*1]
     pmovzxbw m2, [srcq+16*2]
@@ -867,6 +904,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w64
     RET
 .prep_w128:
+    _CET_ENDBR
     pmovzxbw m0, [srcq+16*0]
     pmovzxbw m1, [srcq+16*1]
     pmovzxbw m2, [srcq+16*2]
@@ -897,6 +935,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w128
     RET
 .h:
+    _CET_ENDBR
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
     imul mxyd, 255
@@ -912,6 +951,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea stride3q, [strideq*3]
     jmp wq
 .h_w4:
+    _CET_ENDBR
     vbroadcasti128 m4, [bilin_h_shuf4]
 .h_w4_loop:
     movq xm0, [srcq+strideq*0]
@@ -928,6 +968,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
 .h_w8_loop:
     movu xm0, [srcq+strideq*0]
     vinserti128 m0, [srcq+strideq*1], 1
@@ -945,6 +986,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w8_loop
     RET
 .h_w16:
+    _CET_ENDBR
 .h_w16_loop:
     movu xm0, [srcq+strideq*0+8*0]
     vinserti128 m0, [srcq+strideq*0+8*1], 1
@@ -972,6 +1014,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w16_loop
     RET
 .h_w32:
+    _CET_ENDBR
 .h_w32_loop:
     movu xm0, [srcq+strideq*0+8*0]
     vinserti128 m0, [srcq+strideq*0+8*1], 1
@@ -999,6 +1042,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w32_loop
     RET
 .h_w64:
+    _CET_ENDBR
     movu xm0, [srcq+8*0]
     vinserti128 m0, [srcq+8*1], 1
     movu xm1, [srcq+8*2]
@@ -1025,6 +1069,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     movu xm0, [srcq+8*0]
     vinserti128 m0, [srcq+8*1], 1
     movu xm1, [srcq+8*2]
@@ -1071,6 +1116,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w128
     RET
 .v:
+    _CET_ENDBR
     WIN64_SPILL_XMM 7
     movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
     imul mxyd, 255
@@ -1081,6 +1127,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     vpbroadcastw m6, xm6
     jmp wq
 .v_w4:
+    _CET_ENDBR
     movd xm0, [srcq+strideq*0]
 .v_w4_loop:
     vpbroadcastd m1, [srcq+strideq*2]
@@ -1101,6 +1148,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq xm0, [srcq+strideq*0]
 .v_w8_loop:
     vpbroadcastq m1, [srcq+strideq*2]
@@ -1124,6 +1172,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     vbroadcasti128 m0, [srcq+strideq*0]
 .v_w16_loop:
     vbroadcasti128 m1, [srcq+strideq*1]
@@ -1151,6 +1200,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
     vpermq m0, [srcq+strideq*0], q3120
 .v_w32_loop:
     vpermq m1, [srcq+strideq*1], q3120
@@ -1187,6 +1237,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w32_loop
     RET
 .v_w64:
+    _CET_ENDBR
     vpermq m0, [srcq+strideq*0+32*0], q3120
     vpermq m1, [srcq+strideq*0+32*1], q3120
 .v_w64_loop:
@@ -1224,6 +1275,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w64_loop
     RET
 .v_w128:
+    _CET_ENDBR
     lea r6d, [hq+(3<<8)]
     mov r3, srcq
     mov r5, tmpq
@@ -1257,6 +1309,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w128_loop0
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
 %assign stack_offset stack_offset - stack_size_padded
@@ -1269,6 +1322,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea stride3q, [strideq*3]
     jmp wq
 .hv_w4:
+    _CET_ENDBR
     vbroadcasti128 m4, [bilin_h_shuf4]
     vpbroadcastq m0, [srcq+strideq*0]
     pshufb m0, m4
@@ -1294,6 +1348,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     vbroadcasti128 m0, [srcq+strideq*0]
     pshufb m0, m4
     pmaddubsw m0, m5
@@ -1322,6 +1377,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w8_loop
     RET
 .hv_w16:
+    _CET_ENDBR
     movu xm0, [srcq+strideq*0+8*0]
     vinserti128 m0, [srcq+strideq*0+8*1], 1
     pshufb m0, m4
@@ -1349,6 +1405,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w16_loop
     RET
 .hv_w32:
+    _CET_ENDBR
     movu xm0, [srcq+8*0]
     vinserti128 m0, [srcq+8*1], 1
     movu xm1, [srcq+8*2]
@@ -1382,10 +1439,12 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w32_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea r3d, [hq+(7<<8)]
     mov r6d, 256
     jmp .hv_w64_start
 .hv_w64:
+    _CET_ENDBR
     lea r3d, [hq+(3<<8)]
     mov r6d, 128
 .hv_w64_start:
@@ -1489,6 +1548,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
 %endif
     jmp wq
 .h:
+    _CET_ENDBR
     test myd, 0xf00
     jnz .hv
     vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
@@ -1508,6 +1568,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     add wq, r8
     jmp wq
 .h_w2:
+    _CET_ENDBR
     movzx mxd, mxb
     dec srcq
     mova xm4, [subpel_h_shuf4]
@@ -1529,6 +1590,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w2_loop
     RET
 .h_w4:
+    _CET_ENDBR
     movzx mxd, mxb
     dec srcq
     vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
@@ -1551,6 +1613,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
 %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
     pshufb m%2, m%1, m7
     pshufb m%3, m%1, m8
@@ -1578,6 +1641,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu xm0, [srcq+ssq*0+8*0]
     vinserti128 m0, [srcq+ssq*1+8*0], 1
     movu xm1, [srcq+ssq*0+8*1]
@@ -1593,12 +1657,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     xor r6d, r6d
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov r6, -32*1
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov r6, -32*3
 .h_start:
     sub srcq, r6
@@ -1620,6 +1687,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_loop
     RET
 .v:
+    _CET_ENDBR
 %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM 16
     movzx mxd, myb
@@ -1639,6 +1707,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     sub srcq, ss3q
     jmp r6
 .v_w2:
+    _CET_ENDBR
     movd xm2, [srcq+ssq*0]
     pinsrw xm2, [srcq+ssq*1], 2
     pinsrw xm2, [srcq+ssq*2], 4
@@ -1679,6 +1748,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd xm2, [srcq+ssq*0]
     pinsrd xm2, [srcq+ssq*1], 1
     pinsrd xm2, [srcq+ssq*2], 2
@@ -1719,6 +1789,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq xm1, [srcq+ssq*0]
     vpbroadcastq m4, [srcq+ssq*1]
     vpbroadcastq m2, [srcq+ssq*2]
@@ -1766,6 +1837,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
 .v_w32:
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     lea r6d, [wq*8-128]
     mov r4, srcq
     mov r7, dstq
@@ -1834,6 +1906,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w16_loop0
     RET
 .hv:
+    _CET_ENDBR
 %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM 16
     cmp wd, 4
@@ -1914,6 +1987,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova m6, [subpel_h_shuf4]
     vpbroadcastq m2, [srcq+ssq*0]
     vpbroadcastq m4, [srcq+ssq*1]
@@ -1978,6 +2052,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr mxd, 16
     sub srcq, 3
     vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
@@ -2153,6 +2228,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
 %endif
     jmp wq
 .h:
+    _CET_ENDBR
     test myd, 0xf00
     jnz .hv
     vpbroadcastd m4, [pw_8192]
@@ -2171,6 +2247,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     add wq, r7
     jmp wq
 .h_w4:
+    _CET_ENDBR
     movzx mxd, mxb
     dec srcq
     vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
@@ -2195,6 +2272,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     movu xm0, [srcq+strideq*0]
     vinserti128 m0, [srcq+strideq*1], 1
     lea srcq, [srcq+strideq*2]
@@ -2205,6 +2283,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu xm0, [srcq+strideq*0+8*0]
     vinserti128 m0, [srcq+strideq*0+8*1], 1
     PREP_8TAP_H
@@ -2219,12 +2298,15 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     xor r6d, r6d
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov r6, -32*1
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov r6, -32*3
 .h_start:
     sub srcq, r6
@@ -2247,6 +2329,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .h_loop
     RET
 .v:
+    _CET_ENDBR
 %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM 16
     movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
@@ -2266,6 +2349,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .v_w16
     je .v_w8
 .v_w4:
+    _CET_ENDBR
     movd xm0, [srcq+strideq*0]
     vpbroadcastd m1, [srcq+strideq*2]
     vpbroadcastd xm2, [srcq+strideq*1]
@@ -2310,6 +2394,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq xm1, [srcq+strideq*0]
     vpbroadcastq m4, [srcq+strideq*1]
     vpbroadcastq m2, [srcq+strideq*2]
@@ -2363,6 +2448,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     add wd, wd
     mov r5, srcq
     mov r7, tmpq
@@ -2430,6 +2516,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .v_w16_loop0
     RET
 .hv:
+    _CET_ENDBR
 %assign stack_offset stack_offset - stack_size_padded
 %assign stack_size_padded 0
     WIN64_SPILL_XMM 16
@@ -2454,6 +2541,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     pshufd m15, m0, q3333
     jmp .hv_w8
 .hv_w4:
+    _CET_ENDBR
     movzx mxd, mxb
     dec srcq
     vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
@@ -2547,6 +2635,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     lea r6d, [wq*8-64]
     mov r5, srcq
     mov r7, tmpq
@@ -2794,6 +2883,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jmp wq
 %ifidn %1, put
 .w2:
+    _CET_ENDBR
     mov myd, mym
     movzx t0d, t0b
     dec srcq
@@ -2904,6 +2994,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jmp .w2_loop
 %endif
 .w4:
+    _CET_ENDBR
     mov myd, mym
     vbroadcasti128 m7, [base+rescale_mul]
     movzx t0d, t0b
@@ -3043,22 +3134,27 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     lea srcq, [srcq+ssq*2]
     jmp .w4_loop
 .w8:
+    _CET_ENDBR
     mov dword [rsp+48], 1
     movifprep tmp_stridem, 16
     jmp .w_start
 .w16:
+    _CET_ENDBR
     mov dword [rsp+48], 2
     movifprep tmp_stridem, 32
     jmp .w_start
 .w32:
+    _CET_ENDBR
     mov dword [rsp+48], 4
     movifprep tmp_stridem, 64
     jmp .w_start
 .w64:
+    _CET_ENDBR
     mov dword [rsp+48], 8
     movifprep tmp_stridem, 128
     jmp .w_start
 .w128:
+    _CET_ENDBR
     mov dword [rsp+48], 16
     movifprep tmp_stridem, 256
 .w_start:
@@ -3264,6 +3360,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jmp wq
 %ifidn %1, put
 .dy1_w2:
+    _CET_ENDBR
     mov myd, mym
     movzx t0d, t0b
     dec srcq
@@ -3351,6 +3448,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     RET
 %endif
 .dy1_w4:
+    _CET_ENDBR
     mov myd, mym
     vbroadcasti128 m7, [base+rescale_mul]
     movzx t0d, t0b
@@ -3462,22 +3560,27 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jg .dy1_w4_loop
     MC_8TAP_SCALED_RET
 .dy1_w8:
+    _CET_ENDBR
     mov dword [rsp+72], 1
     movifprep tmp_stridem, 16
     jmp .dy1_w_start
 .dy1_w16:
+    _CET_ENDBR
     mov dword [rsp+72], 2
     movifprep tmp_stridem, 32
     jmp .dy1_w_start
 .dy1_w32:
+    _CET_ENDBR
     mov dword [rsp+72], 4
     movifprep tmp_stridem, 64
     jmp .dy1_w_start
 .dy1_w64:
+    _CET_ENDBR
     mov dword [rsp+72], 8
     movifprep tmp_stridem, 128
     jmp .dy1_w_start
 .dy1_w128:
+    _CET_ENDBR
     mov dword [rsp+72], 16
     movifprep tmp_stridem, 256
 .dy1_w_start:
@@ -3623,11 +3726,13 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     pblendw m3, m4, 0xaa
     jmp .dy1_vloop
 .dy2:
+    _CET_ENDBR
     movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
     add wq, base_reg
     jmp wq
 %ifidn %1, put
 .dy2_w2:
+    _CET_ENDBR
     mov myd, mym
     movzx t0d, t0b
     dec srcq
@@ -3718,6 +3823,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     RET
 %endif
 .dy2_w4:
+    _CET_ENDBR
     mov myd, mym
     vbroadcasti128 m7, [base+rescale_mul]
     movzx t0d, t0b
@@ -3822,22 +3928,27 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jg .dy2_w4_loop
     MC_8TAP_SCALED_RET
 .dy2_w8:
+    _CET_ENDBR
     mov dword [rsp+40], 1
     movifprep tmp_stridem, 16
     jmp .dy2_w_start
 .dy2_w16:
+    _CET_ENDBR
     mov dword [rsp+40], 2
     movifprep tmp_stridem, 32
     jmp .dy2_w_start
 .dy2_w32:
+    _CET_ENDBR
     mov dword [rsp+40], 4
     movifprep tmp_stridem, 64
     jmp .dy2_w_start
 .dy2_w64:
+    _CET_ENDBR
     mov dword [rsp+40], 8
     movifprep tmp_stridem, 128
     jmp .dy2_w_start
 .dy2_w128:
+    _CET_ENDBR
     mov dword [rsp+40], 16
     movifprep tmp_stridem, 256
 .dy2_w_start:
@@ -4187,6 +4298,7 @@ ALIGN function_align
     lea tmp1d, [deltaq*3]
     sub gammad, tmp1d ; gamma -= delta*3
 .main2:
+    _CET_ENDBR
     call .h
     psrld m6, m5, 16
     pblendw m6, m0, 0xaa ; 57
@@ -4202,6 +4314,7 @@ ALIGN function_align
     ret
 ALIGN function_align
 .h:
+    _CET_ENDBR
     lea tmp1d, [mxq+alphaq*4]
     lea tmp2d, [mxq+alphaq*1]
     vbroadcasti128 m10, [srcq]
@@ -4244,6 +4357,7 @@ ALIGN function_align
     lea stride3q, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movd [dstq ], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -4277,6 +4391,7 @@ ALIGN function_align
     %1 0
     lea dstq, [dstq+strideq*4]
 .w8:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movq [dstq ], xm0
     movq [dstq+strideq*1], xm1
@@ -4290,6 +4405,7 @@ ALIGN function_align
     %1 0
     lea dstq, [dstq+strideq*4]
 .w16:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq ], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -4305,6 +4421,7 @@ ALIGN function_align
     %1 0
     lea dstq, [dstq+strideq*2]
 .w32:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], m0
     %1 2
@@ -4318,6 +4435,7 @@ ALIGN function_align
     %1 0
     add dstq, strideq
 .w64:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq], m0
     %1 2
@@ -4330,6 +4448,7 @@ ALIGN function_align
     %1 0
     add dstq, strideq
 .w128:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+0*32], m0
     %1 2
@@ -4499,6 +4618,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     lea r6, [dsq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     movd xm0, [dstq+dsq*0]
     pinsrd xm0, [dstq+dsq*1], 1
     vpbroadcastd xm1, [dstq+dsq*2]
@@ -4526,6 +4646,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     movq xm1, [dstq+dsq*0]
     movhps xm1, [dstq+dsq*1]
     vpbroadcastq m2, [dstq+dsq*2]
@@ -4556,6 +4677,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     mova m0, [maskq]
     mova xm1, [dstq+dsq*0]
     vinserti128 m1, [dstq+dsq*1], 1
@@ -4579,6 +4701,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     mova m0, [maskq]
     mova m1, [dstq]
     mova m6, [maskq+tmpq]
@@ -4610,6 +4733,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     add maskq, obmc_masks-blend_v_avx2_table
     jmp wq
 .w2:
+    _CET_ENDBR
     vpbroadcastd xm2, [maskq+2*2]
 .w2_s0_loop:
     movd xm0, [dstq+dsq*0]
@@ -4628,6 +4752,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     RET
 ALIGN function_align
 .w4:
+    _CET_ENDBR
     vpbroadcastq xm2, [maskq+4*2]
 .w4_loop:
     movd xm0, [dstq+dsq*0]
@@ -4646,6 +4771,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     mova xm3, [maskq+8*2]
 .w8_loop:
     movq xm0, [dstq+dsq*0]
@@ -4667,6 +4793,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     vbroadcasti128 m3, [maskq+16*2]
     vbroadcasti128 m4, [maskq+16*3]
 .w16_loop:
@@ -4689,6 +4816,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     mova xm3, [maskq+16*4]
     vinserti128 m3, [maskq+16*6], 1
     mova xm4, [maskq+16*5]
@@ -4726,6 +4854,7 @@ cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mas
     neg hq
     jmp wq
 .w2:
+    _CET_ENDBR
     movd xm0, [dstq+dsq*0]
     pinsrw xm0, [dstq+dsq*1], 1
     movd xm2, [maskq+hq*2]
@@ -4744,6 +4873,7 @@ cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mas
     RET
 ALIGN function_align
 .w4:
+    _CET_ENDBR
     mova xm3, [blend_shuf]
 .w4_loop:
     movd xm0, [dstq+dsq*0]
@@ -4764,6 +4894,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     vbroadcasti128 m4, [blend_shuf]
     shufpd m4, m4, 0x03
 .w8_loop:
@@ -4788,6 +4919,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     vbroadcasti128 m4, [blend_shuf]
     shufpd m4, m4, 0x0c
 .w16_loop:
@@ -4812,6 +4944,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32: ; w32/w64/w128
+    _CET_ENDBR
     sub dsq, r6
 .w32_loop0:
     vpbroadcastw m3, [maskq+hq*2]
@@ -4913,6 +5046,7 @@ cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y

 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix
 .v_loop_%3:
+    _CET_ENDBR
 %if %1
     ; left extension
     xor r3, r3
@@ -5174,6 +5308,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     lea stride3q, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movd [dstq+strideq*0], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -5199,6 +5334,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     movq [maskq], xm4
     RET
 .w4_h16:
+    _CET_ENDBR
     W_MASK 0, 5, 2, 3
     lea dstq, [dstq+strideq*4]
     phaddd m4, m5
@@ -5226,6 +5362,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     lea dstq, [dstq+strideq*4]
     add maskq, 8
 .w8:
+    _CET_ENDBR
     vextracti128 xm2, m4, 1
     vextracti128 xm1, m0, 1
     psubw xm4, xm8, xm4
@@ -5247,6 +5384,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     lea dstq, [dstq+strideq*4]
     add maskq, 16
 .w16:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -5272,6 +5410,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     lea dstq, [dstq+strideq*2]
     add maskq, 16
 .w32:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], m0
     W_MASK 0, 5, 2, 3
@@ -5296,6 +5435,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     W_MASK 0, 4, 0, 1
     add dstq, strideq
 .w64:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     W_MASK 0, 5, 2, 3
@@ -5322,6 +5462,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     W_MASK 0, 4, 0, 1
     add dstq, strideq
 .w128:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     W_MASK 0, 5, 2, 3
@@ -5381,6 +5522,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     lea stride3q, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movd [dstq+strideq*0], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -5403,6 +5545,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     mova [maskq], xm5
     RET
 .w4_h16:
+    _CET_ENDBR
     W_MASK 0, 5, 2, 3
     lea dstq, [dstq+strideq*4]
     packuswb m4, m5
@@ -5428,6 +5571,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     lea dstq, [dstq+strideq*4]
     add maskq, 16
 .w8:
+    _CET_ENDBR
     vextracti128 xm5, m4, 1
     vextracti128 xm1, m0, 1
     packuswb xm4, xm5
@@ -5449,6 +5593,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     lea dstq, [dstq+strideq*4]
     add maskq, 32
 .w16:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -5471,6 +5616,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     lea dstq, [dstq+strideq*2]
     add maskq, 32
 .w32:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], m0
     W_MASK 0, 5, 2, 3
@@ -5491,6 +5637,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     add dstq, strideq
     add maskq, 32
 .w64:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     W_MASK 0, 5, 2, 3
@@ -5511,6 +5658,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     add dstq, strideq
     add maskq, 32*2
 .w128:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     W_MASK 0, 5, 2, 3
@@ -5551,6 +5699,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     lea stride3q, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movd [dstq+strideq*0], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -5587,6 +5736,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     lea dstq, [dstq+strideq*4]
     add maskq, 32
 .w8:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movq [dstq+strideq*0], xm0
     movq [dstq+strideq*1], xm1
@@ -5603,6 +5753,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     lea dstq, [dstq+strideq*2]
     add maskq, 32
 .w16:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -5617,6 +5768,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     add dstq, strideq
     add maskq, 32
 .w32:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq], m0
     mova [maskq], m4
@@ -5630,6 +5782,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     add dstq, strideq
     add maskq, 32*2
 .w64:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     mova [maskq+32*0], m4
@@ -5647,6 +5800,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     add dstq, strideq
     add maskq, 32*4
 .w128:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     mova [maskq+32*0], m4
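
Note for review: every label that gains a _CET_ENDBR above is entered through
a computed jump table (a "jmp wq" or "jmp r6" after indexing one of the
*_table arrays), i.e. via an indirect branch. With Intel CET Indirect Branch
Tracking (IBT) enabled, an indirect jump or call must land on an ENDBR64
instruction; landing anywhere else raises a control-protection fault (#CP).
That is also why plain loop labels such as .h_w4_loop or .v_w2_loop stay
unannotated: they are only reached by direct branches and fallthrough, which
IBT does not track.

This diff only inserts the marker; it presupposes a companion definition of
the _CET_ENDBR macro elsewhere (typically in the shared x86inc.asm, not part
of this file). A minimal sketch of what that definition could look like,
assuming the build forwards a define such as __CET__ to nasm when compiling
with -fcf-protection (the guard name here is an assumption, not taken from
this patch):

    %ifdef __CET__                 ; hypothetical guard set by the build system
        %define _CET_ENDBR endbr64 ; 4-byte IBT landing pad, a NOP on pre-CET CPUs
    %else
        %define _CET_ENDBR         ; expands to nothing in non-CET builds
    %endif

Since ENDBR64 executes as a no-op on processors without CET, defining the
macro unconditionally as endbr64 would also be functionally safe; the
conditional form merely avoids spending 4 extra bytes at every dispatch
target in non-CET builds.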