Index: src/x86/mc_sse.asm --- src/x86/mc_sse.asm.orig +++ src/x86/mc_sse.asm @@ -344,11 +344,13 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, test mxyd, mxyd jnz .v .put: + _CET_ENDBR movzx wd, word [t0+wq*2+table_offset(put,)] add wq, t0 RESTORE_DSQ_32 t0 jmp wq .put_w2: + _CET_ENDBR movzx r4d, word [srcq+ssq*0] movzx r6d, word [srcq+ssq*1] lea srcq, [srcq+ssq*2] @@ -359,6 +361,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .put_w2 RET .put_w4: + _CET_ENDBR mov r4d, [srcq+ssq*0] mov r6d, [srcq+ssq*1] lea srcq, [srcq+ssq*2] @@ -369,6 +372,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .put_w4 RET .put_w8: + _CET_ENDBR movq m0, [srcq+ssq*0] movq m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] @@ -379,6 +383,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .put_w8 RET .put_w16: + _CET_ENDBR movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] @@ -389,6 +394,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .put_w16 RET .put_w32: + _CET_ENDBR movu m0, [srcq+ssq*0+16*0] movu m1, [srcq+ssq*0+16*1] movu m2, [srcq+ssq*1+16*0] @@ -403,6 +409,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .put_w32 RET .put_w64: + _CET_ENDBR movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] @@ -417,6 +424,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .put_w64 RET .put_w128: + _CET_ENDBR movu m0, [srcq+16*0] movu m1, [srcq+16*1] movu m2, [srcq+16*2] @@ -439,6 +447,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .put_w128 RET .h: + _CET_ENDBR ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4 ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4 imul mxyd, 0x00ff00ff @@ -456,6 +465,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, movifnidn dsq, dsmp jmp wq .h_w2: + _CET_ENDBR pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5} .h_w2_loop: movd m0, [srcq+ssq*0] @@ -475,6 +485,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .h_w2_loop RET .h_w4: + _CET_ENDBR movq m4, [srcq+ssq*0] movhps m4, [srcq+ssq*1] lea srcq, [srcq+ssq*2] @@ -490,6 +501,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .h_w4 RET .h_w8: + _CET_ENDBR movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] @@ -507,6 +519,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .h_w8 RET .h_w16: + _CET_ENDBR movu m0, [srcq+8*0] movu m1, [srcq+8*1] add srcq, ssq @@ -523,6 +536,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .h_w16 RET .h_w32: + _CET_ENDBR movu m0, [srcq+mmsize*0+8*0] movu m1, [srcq+mmsize*0+8*1] pshufb m0, m4 @@ -549,6 +563,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .h_w32 RET .h_w64: + _CET_ENDBR mov r6, -16*3 .h_w64_loop: movu m0, [srcq+r6+16*3+8*0] @@ -569,6 +584,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .h_w64 RET .h_w128: + _CET_ENDBR mov r6, -16*7 .h_w128_loop: movu m0, [srcq+r6+16*7+8*0] @@ -589,6 +605,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .h_w128 RET .v: + _CET_ENDBR movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)] imul mxyd, 0x00ff00ff mova m5, [base+pw_2048] @@ -599,6 +616,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, movifnidn dsq, dsmp jmp wq .v_w2: + _CET_ENDBR movd m0, [srcq+ssq*0] .v_w2_loop: pinsrw m0, [srcq+ssq*1], 1 ; 0 1 @@ -618,6 +636,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .v_w2_loop RET .v_w4: + _CET_ENDBR movd m0, [srcq+ssq*0] .v_w4_loop: movd m2, [srcq+ssq*1] @@ -639,6 +658,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, 
ds, src, ss, w, jg .v_w4_loop RET .v_w8: + _CET_ENDBR movq m0, [srcq+ssq*0] .v_w8_loop: movq m2, [srcq+ssq*1] @@ -687,17 +707,22 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg %%loop %endmacro .v_w16: + _CET_ENDBR PUT_BILIN_V_W16 RET .v_w128: + _CET_ENDBR lea r6d, [hq+(7<<16)] jmp .v_w16gt .v_w64: + _CET_ENDBR lea r6d, [hq+(3<<16)] jmp .v_w16gt .v_w32: + _CET_ENDBR lea r6d, [hq+(1<<16)] .v_w16gt: + _CET_ENDBR mov r4, srcq %if ARCH_X86_64 mov r7, dstq @@ -722,6 +747,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .v_w16gt RET .hv: + _CET_ENDBR ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8 ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4 movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)] @@ -735,6 +761,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, punpcklqdq m6, m6 jmp wq .hv_w2: + _CET_ENDBR RESTORE_DSQ_32 t0 movd m0, [srcq+ssq*0] punpckldq m0, m0 @@ -769,6 +796,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .hv_w2_loop RET .hv_w4: + _CET_ENDBR mova m4, [base+bilin_h_shuf4] movddup m0, [srcq+ssq*0] movifnidn dsq, dsmp @@ -796,6 +824,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .hv_w4_loop RET .hv_w8: + _CET_ENDBR movu m0, [srcq+ssq*0] movifnidn dsq, dsmp pshufb m0, m4 @@ -826,12 +855,15 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, jg .hv_w8_loop RET .hv_w128: + _CET_ENDBR lea r6d, [hq+(7<<16)] jmp .hv_w16_start .hv_w64: + _CET_ENDBR lea r6d, [hq+(3<<16)] jmp .hv_w16_start .hv_w32: + _CET_ENDBR lea r6d, [hq+(1<<16)] .hv_w16_start: mov r4, srcq @@ -841,6 +873,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w, mov r7, dstq %endif .hv_w16: + _CET_ENDBR movifnidn dsq, dsmp %if WIN64 movaps r4m, m8 @@ -967,6 +1000,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, test mxyd, mxyd jnz .v .prep: + _CET_ENDBR %if notcpuflag(ssse3) add r6, prep_ssse3 - prep_sse2 jmp prep_ssse3 @@ -977,6 +1011,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, lea stride3q, [strideq*3] jmp wq .prep_w4: + _CET_ENDBR movd m0, [srcq+strideq*0] movd m1, [srcq+strideq*1] movd m2, [srcq+strideq*2] @@ -995,6 +1030,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .prep_w4 RET .prep_w8: + _CET_ENDBR movq m0, [srcq+strideq*0] movq m1, [srcq+strideq*1] movq m2, [srcq+strideq*2] @@ -1017,6 +1053,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .prep_w8 RET .prep_w16: + _CET_ENDBR movu m1, [srcq+strideq*0] movu m3, [srcq+strideq*1] lea srcq, [srcq+strideq*2] @@ -1037,12 +1074,15 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .prep_w16 RET .prep_w128: + _CET_ENDBR mov r3, -128 jmp .prep_w32_start .prep_w64: + _CET_ENDBR mov r3, -64 jmp .prep_w32_start .prep_w32: + _CET_ENDBR mov r3, -32 .prep_w32_start: sub srcq, r3 @@ -1072,6 +1112,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, RET %endif .h: + _CET_ENDBR ; 16 * src[x] + (mx * (src[x + 1] - src[x])) ; = (16 - mx) * src[x] + mx * src[x + 1] %if cpuflag(ssse3) @@ -1095,6 +1136,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, add wq, r6 jmp wq .h_w4: + _CET_ENDBR %if cpuflag(ssse3) mova m4, [base+bilin_h_shuf4] %endif @@ -1116,6 +1158,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .h_w4_loop RET .h_w8: + _CET_ENDBR lea stride3q, [strideq*3] .h_w8_loop: movu m0, [srcq+strideq*0] @@ -1140,6 +1183,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .h_w8_loop RET .h_w16: + _CET_ENDBR movu m0, [srcq+strideq*0+8*0] movu m1, [srcq+strideq*0+8*1] 
movu m2, [srcq+strideq*1+8*0] @@ -1162,12 +1206,15 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .h_w16 RET .h_w128: + _CET_ENDBR mov r3, -128 jmp .h_w32_start .h_w64: + _CET_ENDBR mov r3, -64 jmp .h_w32_start .h_w32: + _CET_ENDBR mov r3, -32 .h_w32_start: sub srcq, r3 @@ -1198,6 +1245,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .h_w32_vloop RET .v: + _CET_ENDBR %if notcpuflag(ssse3) %assign stack_offset stack_offset - stack_size_padded WIN64_SPILL_XMM 8 @@ -1217,6 +1265,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, pshufd m5, m5, q0000 jmp wq .v_w4: + _CET_ENDBR movd m0, [srcq+strideq*0] .v_w4_loop: movd m1, [srcq+strideq*1] @@ -1239,6 +1288,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .v_w4_loop RET .v_w8: + _CET_ENDBR movq m0, [srcq+strideq*0] .v_w8_loop: movq m1, [srcq+strideq*1] @@ -1263,6 +1313,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .v_w8_loop RET .v_w16: + _CET_ENDBR movu m0, [srcq+strideq*0] .v_w16_loop: movu m1, [srcq+strideq*1] @@ -1299,14 +1350,17 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .v_w16_loop RET .v_w128: + _CET_ENDBR lea r3d, [hq+(3<<8)] mov r6d, 256 jmp .v_w32_start .v_w64: + _CET_ENDBR lea r3d, [hq+(1<<8)] mov r6d, 128 jmp .v_w32_start .v_w32: + _CET_ENDBR xor r3d, r3d mov r6d, 64 .v_w32_start: @@ -1372,6 +1426,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, %endif RET .hv: + _CET_ENDBR ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4 ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4) movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)] @@ -1394,6 +1449,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, pshufd m6, m6, q0000 jmp wq .hv_w4: + _CET_ENDBR %if cpuflag(ssse3) mova m4, [base+bilin_h_shuf4] movddup m0, [srcq+strideq*0] @@ -1429,6 +1485,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .hv_w4_loop RET .hv_w8: + _CET_ENDBR movu m0, [srcq+strideq*0] PSHUFB_BILIN_H8 m0, m4 PMADDUBSW m0, m5, m7, m4, 0 ; 0 @@ -1454,18 +1511,22 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w, jg .hv_w8_loop RET .hv_w128: + _CET_ENDBR lea r3d, [hq+(7<<8)] mov r5d, 256 jmp .hv_w16_start .hv_w64: + _CET_ENDBR lea r3d, [hq+(3<<8)] mov r5d, 128 jmp .hv_w16_start .hv_w32: + _CET_ENDBR lea r3d, [hq+(1<<8)] mov r5d, 64 jmp .hv_w16_start .hv_w16: + _CET_ENDBR xor r3d, r3d mov r5d, 32 .hv_w16_start: @@ -1627,6 +1688,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h lea r6, [ssq*3] jmp wq .h: + _CET_ENDBR %if ARCH_X86_32 test ssd, 0xf00 %else @@ -1654,6 +1716,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h add wq, base_reg jmp wq .h_w2: + _CET_ENDBR %if ARCH_X86_32 and mxd, 0x7f %else @@ -1684,6 +1747,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h jg .h_w2_loop RET .h_w4: + _CET_ENDBR %if ARCH_X86_32 and mxd, 0x7f %else @@ -1735,6 +1799,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h psraw %1, 6 %endmacro .h_w8: + _CET_ENDBR movu m0, [srcq+ssq*0] movu m1, [srcq+ssq*1] lea srcq, [srcq+ssq*2] @@ -1755,15 +1820,19 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h jg .h_w8 RET .h_w128: + _CET_ENDBR mov r4, -16*7 jmp .h_w16_start .h_w64: + _CET_ENDBR mov r4, -16*3 jmp .h_w16_start .h_w32: + _CET_ENDBR mov r4, -16*1 jmp .h_w16_start .h_w16: + _CET_ENDBR xor r4d, r4d .h_w16_start: sub srcq, r4 @@ -1785,6 +1854,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h jg .h_w16_loop_v RET .v: + _CET_ENDBR %if ARCH_X86_32 movzx mxd, ssb shr ssd, 16 @@ -1840,6 +1910,7 
@@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h %endif jmp r6 .v_w2: + _CET_ENDBR movd m1, [srcq+ssq*0] movd m0, [srcq+ssq*1] %if ARCH_X86_32 @@ -1895,12 +1966,14 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h jg .v_w2_loop RET .v_w4: + _CET_ENDBR %if ARCH_X86_32 .v_w8: .v_w16: .v_w32: .v_w64: .v_w128: + _CET_ENDBR shl wd, 14 %if STACK_ALIGNMENT < 16 %define dstm [rsp+mmsize*4+gprsize] @@ -1979,6 +2052,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h .v_w32: .v_w64: .v_w128: + _CET_ENDBR lea r6d, [wq*8-64] mov r4, srcq mov r7, dstq @@ -2048,6 +2122,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h %undef subpel2 %undef subpel3 .hv: + _CET_ENDBR %assign stack_offset org_stack_offset cmp wd, 4 jg .hv_w8 @@ -2113,6 +2188,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h cmp wd, 4 je .hv_w4 .hv_w2: + _CET_ENDBR mova m6, [base+subpel_h_shuf4] movq m2, [srcq+ssq*0] ; 0 movhps m2, [srcq+ssq*1] ; 0 _ 1 @@ -2192,6 +2268,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h %undef w8192reg %undef d512reg .hv_w4: + _CET_ENDBR %define hv4_line_0_0 4 %define hv4_line_0_1 5 %define hv4_line_0_2 6 @@ -2369,6 +2446,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h %undef subpelv2 %undef subpelv3 .hv_w8: + _CET_ENDBR %assign stack_offset org_stack_offset %define hv8_line_1 0 %define hv8_line_2 1 @@ -2869,6 +2947,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, %endif jmp wq .h: + _CET_ENDBR LEA base_reg, prep%+SUFFIX test myd, 0xf00 jnz .hv @@ -2918,6 +2997,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, add wq, base_reg jmp wq .h_w4: + _CET_ENDBR %if ARCH_X86_32 and mxd, 0x7f %else @@ -3038,6 +3118,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, jg .h_w4_loop RET .h_w8: + _CET_ENDBR %if cpuflag(ssse3) PREP_8TAP_H 0, srcq+strideq*0 PREP_8TAP_H 1, srcq+strideq*1 @@ -3056,15 +3137,19 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, jg .h_w8 RET .h_w16: + _CET_ENDBR mov r3, -16*1 jmp .h_start .h_w32: + _CET_ENDBR mov r3, -16*2 jmp .h_start .h_w64: + _CET_ENDBR mov r3, -16*4 jmp .h_start .h_w128: + _CET_ENDBR mov r3, -16*8 .h_start: sub srcq, r3 @@ -3090,6 +3175,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, jg .h_loop RET .v: + _CET_ENDBR LEA base_reg, prep%+SUFFIX %if ARCH_X86_32 mov mxd, myd @@ -3149,6 +3235,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, jns .v_w8 %endif .v_w4: + _CET_ENDBR %if notcpuflag(ssse3) pxor m6, m6 %if ARCH_X86_64 @@ -3262,6 +3349,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, RET %if ARCH_X86_64 .v_w8: + _CET_ENDBR lea r6d, [wq*8-64] mov r5, srcq mov r8, tmpq @@ -3359,6 +3447,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, %undef subpel2 %undef subpel3 .hv: + _CET_ENDBR %assign stack_offset org_stack_offset cmp wd, 4 jg .hv_w8 @@ -3659,6 +3748,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w, %undef subpelv2 %undef subpelv3 .hv_w8: + _CET_ENDBR %assign stack_offset org_stack_offset %define hv8_line_1 0 %define hv8_line_2 1 @@ -4317,6 +4407,7 @@ cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, sr jmp wq %ifidn %1, put .w2: + _CET_ENDBR %if ARCH_X86_64 mov myd, mym movzx t0d, t0b @@ -4551,6 +4642,7 @@ cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, sr %endif INIT_XMM ssse3 .w4: + _CET_ENDBR %if ARCH_X86_64 mov myd, mym movzx t0d, t0b @@ -4896,22 +4988,27 @@ INIT_XMM ssse3 jmp .w4_loop INIT_XMM ssse3 .w8: + _CET_ENDBR mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .w_start .w16: + _CET_ENDBR mov dword 
[rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .w_start .w32: + _CET_ENDBR mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .w_start .w64: + _CET_ENDBR mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .w_start .w128: + _CET_ENDBR mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .w_start: @@ -5427,11 +5524,13 @@ INIT_XMM ssse3 jmp .vloop INIT_XMM ssse3 .dy1: + _CET_ENDBR movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy1_w2: + _CET_ENDBR %if ARCH_X86_64 mov myd, mym movzx t0d, t0b @@ -5606,6 +5705,7 @@ INIT_XMM ssse3 %endif INIT_XMM ssse3 .dy1_w4: + _CET_ENDBR %if ARCH_X86_64 mov myd, mym movzx t0d, t0b @@ -5940,22 +6040,27 @@ INIT_XMM ssse3 jmp .dy1_w4_loop INIT_XMM ssse3 .dy1_w8: + _CET_ENDBR mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .dy1_w_start .dy1_w16: + _CET_ENDBR mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .dy1_w_start .dy1_w32: + _CET_ENDBR mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .dy1_w_start .dy1_w64: + _CET_ENDBR mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .dy1_w_start .dy1_w128: + _CET_ENDBR mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .dy1_w_start: @@ -6406,11 +6511,13 @@ INIT_XMM ssse3 jmp .dy1_vloop INIT_XMM ssse3 .dy2: + _CET_ENDBR movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] add wq, base_reg jmp wq %ifidn %1, put .dy2_w2: + _CET_ENDBR %if ARCH_X86_64 mov myd, mym movzx t0d, t0b @@ -6592,6 +6699,7 @@ INIT_XMM ssse3 %endif INIT_XMM ssse3 .dy2_w4: + _CET_ENDBR %if ARCH_X86_64 mov myd, mym movzx t0d, t0b @@ -6842,22 +6950,27 @@ INIT_XMM ssse3 MC_8TAP_SCALED_RET INIT_XMM ssse3 .dy2_w8: + _CET_ENDBR mov dword [rsp+0x90], 1 movifprep tmp_stridem, 16 jmp .dy2_w_start .dy2_w16: + _CET_ENDBR mov dword [rsp+0x90], 2 movifprep tmp_stridem, 32 jmp .dy2_w_start .dy2_w32: + _CET_ENDBR mov dword [rsp+0x90], 4 movifprep tmp_stridem, 64 jmp .dy2_w_start .dy2_w64: + _CET_ENDBR mov dword [rsp+0x90], 8 movifprep tmp_stridem, 128 jmp .dy2_w_start .dy2_w128: + _CET_ENDBR mov dword [rsp+0x90], 16 movifprep tmp_stridem, 256 .dy2_w_start: @@ -7852,6 +7965,7 @@ ALIGN function_align ret ALIGN function_align .h: + _CET_ENDBR %if ARCH_X86_32 %define m8 m3 %define m9 m4 @@ -8022,6 +8136,7 @@ DECLARE_REG_TMP 6, 7 %1 0 lea dstq, [dstq+strideq*4] .w4: ; tile 4x + _CET_ENDBR movd [dstq ], m0 ; copy dw[0] pshuflw m1, m0, q1032 ; swap dw[1] and dw[0] movd [dstq+strideq*1], m1 ; copy dw[1] @@ -8037,6 +8152,7 @@ DECLARE_REG_TMP 6, 7 %1 0 lea dstq, [dstq+strideq*2] .w8: + _CET_ENDBR movq [dstq ], m0 movhps [dstq+strideq*1], m0 sub hd, 2 @@ -8047,6 +8163,7 @@ DECLARE_REG_TMP 6, 7 %1 0 lea dstq, [dstq+strideq] .w16: + _CET_ENDBR mova [dstq ], m0 dec hd jg .w16_loop @@ -8056,6 +8173,7 @@ DECLARE_REG_TMP 6, 7 %1 0 lea dstq, [dstq+strideq] .w32: + _CET_ENDBR mova [dstq ], m0 %1 2 mova [dstq + 16 ], m0 @@ -8067,6 +8185,7 @@ DECLARE_REG_TMP 6, 7 %1 0 add dstq, strideq .w64: + _CET_ENDBR %assign i 0 %rep 4 mova [dstq + i*16 ], m0 @@ -8083,6 +8203,7 @@ DECLARE_REG_TMP 6, 7 %1 0 add dstq, strideq .w128: + _CET_ENDBR %assign i 0 %rep 8 mova [dstq + i*16 ], m0 @@ -8264,6 +8385,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t add maskq, 4 lea dstq, [dstq+strideq*2] .w4: + _CET_ENDBR pshufd m3, m2, q2020 pshufd m2, m2, q3131 psubw m1, m7, m3 @@ -8287,6 +8409,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t add maskq, 4 lea dstq, [dstq+strideq*2] .w8: + _CET_ENDBR movhlps m3, m2 psubw m1, m7, m2 psubw m1, m3 @@ -8303,6 +8426,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9,
dst, stride, tmp1, t add maskq, 8 lea dstq, [dstq+strideq*2] .w16: + _CET_ENDBR mova [dstq+strideq*1], m2 mova [dstq+strideq*0], m0 call .main @@ -8320,6 +8444,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t add maskq, 16 lea dstq, [dstq+strideq*2] .w32: + _CET_ENDBR mova [maskq], m2 mova [dstq+strideq*0+16*0], m0 call .main @@ -8334,6 +8459,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t add maskq, 16*2 lea dstq, [dstq+strideq*2] .w64: + _CET_ENDBR mova [maskq+16*0], m2 mova [dstq+strideq*0+16*0], m0 call .main @@ -8354,6 +8480,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t add maskq, 16*4 lea dstq, [dstq+strideq*2] .w128: + _CET_ENDBR mova [maskq+16*0], m2 mova [dstq+strideq*0+16*0], m0 call .main @@ -8457,6 +8584,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, add maskq, 8 lea dstq, [dstq+strideq*2] .w4: + _CET_ENDBR packuswb m2, m2 psubb m1, m7, m2 %if ARCH_X86_64 @@ -8482,6 +8610,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, add maskq, 16 lea dstq, [dstq+strideq*2] .w8: + _CET_ENDBR W_MASK_422_BACKUP 0 movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 @@ -8498,6 +8627,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, add maskq, 16 lea dstq, [dstq+strideq*2] .w16: + _CET_ENDBR W_MASK_422_BACKUP 0 mova [dstq+strideq*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main @@ -8511,6 +8641,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, add maskq, 16 add dstq, strideq .w32: + _CET_ENDBR W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main @@ -8524,6 +8655,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, add maskq, 16*2 add dstq, strideq .w64: + _CET_ENDBR W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main @@ -8543,6 +8675,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1, add maskq, 16*4 add dstq, strideq .w128: + _CET_ENDBR W_MASK_422_BACKUP 0 mova [dstq+16*0], m0 call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main @@ -8593,6 +8726,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t call .main lea dstq, [dstq+strideq*2] .w4: + _CET_ENDBR movd [dstq+strideq*0], m0 pshuflw m1, m0, q1032 movd [dstq+strideq*1], m1 @@ -8608,6 +8742,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t call .main lea dstq, [dstq+strideq*2] .w8: + _CET_ENDBR movq [dstq+strideq*0], m0 movhps [dstq+strideq*1], m0 sub hd, 2 @@ -8617,6 +8752,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t call .main lea dstq, [dstq+strideq*2] .w16: + _CET_ENDBR mova [dstq+strideq*0], m0 call .main mova [dstq+strideq*1], m0 @@ -8627,6 +8763,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t call .main add dstq, strideq .w32: + _CET_ENDBR mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 @@ -8637,6 +8774,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t call .main add dstq, strideq .w64: + _CET_ENDBR mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 @@ -8651,6 +8789,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t call .main add dstq, strideq .w128: + _CET_ENDBR mova [dstq+16*0], m0 call .main mova [dstq+16*1], m0 @@ -8729,6 +8868,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask lea r6, [dsq*3] jmp wq .w4: + _CET_ENDBR movq m0, [maskq]; m movd m1, [dstq+dsq*0] ; a movd m6, [dstq+dsq*1] @@ -8750,6 +8890,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask jg .w4 RET .w8: + _CET_ENDBR mova m0, [maskq]; m movq m1, [dstq+dsq*0] ; a movhps m1, 
[dstq+dsq*1] @@ -8764,6 +8905,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask jg .w8 RET .w16: + _CET_ENDBR mova m0, [maskq]; m mova m1, [dstq] ; a mova m6, [tmpq] ; b @@ -8776,6 +8918,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask jg .w16 RET .w32: + _CET_ENDBR %assign i 0 %rep 2 mova m0, [maskq+16*i]; m @@ -8803,6 +8946,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas add maskq, obmc_masks-blend_v_ssse3_table jmp wq .w2: + _CET_ENDBR movd m3, [maskq+4] punpckldq m3, m3 ; 2 mask blend is provided for 4 pixels / 2 lines @@ -8824,6 +8968,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas jg .w2_loop RET .w4: + _CET_ENDBR movddup m3, [maskq+8] ; 4 mask blend is provided for 8 pixels / 2 lines .w4_loop: @@ -8844,6 +8989,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas jg .w4_loop RET .w8: + _CET_ENDBR mova m3, [maskq+16] ; 8 mask blend is provided for 16 pixels .w8_loop: @@ -8859,6 +9005,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas jg .w8_loop RET .w16: + _CET_ENDBR ; 16 mask blend is provided for 32 pixels mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0]) mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1]) @@ -8873,6 +9020,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas jg .w16_loop RET .w32: + _CET_ENDBR %if WIN64 mova [rsp+8], xmm6 %endif @@ -8922,6 +9070,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas neg hq jmp wq .w2: + _CET_ENDBR movd m0, [dstq+dsq*0] pinsrw m0, [dstq+dsq*1], 1 movd m2, [maskq+hq*2] @@ -8941,6 +9090,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas jl .w2 RET .w4: + _CET_ENDBR %if ARCH_X86_32 mova m3, [base+blend_shuf] %else @@ -8966,6 +9116,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas jl .w4_loop RET .w8: + _CET_ENDBR movd m4, [maskq+hq*2] punpcklwd m4, m4 pshufd m3, m4, q0000 @@ -8983,6 +9134,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas RET ; w16/w32/w64/w128 .w16: + _CET_ENDBR %if ARCH_X86_32 mov r6d, wm %endif
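
A note on the arithmetic that the comment lines in the .h and .hv hunks above spell out for the bilinear paths: the horizontal pass computes (16*s[x] + mx*(s[x+1] - s[x]) + 8) >> 4, rewritten as ((16-mx)*s[x] + mx*s[x+1] + 8) >> 4 so both taps are non-negative byte coefficients, and the hv path applies (16*t + my*(t1 - t) + 128) >> 8 to the 16x-scaled first-pass intermediates. The _CET_ENDBR insertions mark the labels reached indirectly through the width jump tables (jmp wq, jmp r6); the macro is expected to expand to endbr32/endbr64 when Intel CET/IBT is enabled and to nothing otherwise. Below is a minimal scalar C sketch of that rounding; the names are illustrative only and not part of dav1d's code.

#include <stdint.h>

/* One output pixel of the horizontal bilinear filter; mx is the 0..15
   subpel fraction, matching ((16-mx)*s[x] + mx*s[x+1] + 8) >> 4. */
static uint8_t bilin_h_px(const uint8_t *s, int x, int mx)
{
    return (uint8_t)(((16 - mx) * s[x] + mx * s[x + 1] + 8) >> 4);
}

/* Second pass of the hv path: t0/t1 are the 16x-scaled intermediates from
   two adjacent rows, so the result is rounded back with +128 and >> 8. */
static uint8_t bilin_hv_px(int t0, int t1, int my)
{
    return (uint8_t)((16 * t0 + my * (t1 - t0) + 128) >> 8);
}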