ports/multimedia/dav1d/patches/patch-src_x86_mc16_sse_asm

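Add Intel CET end-branch landing pads (_CET_ENDBR) to every label in
the 16bpc SSE motion compensation code that is reached through a
computed jump (jmp wq) or an indirect call, so these routines remain
valid branch targets when indirect branch tracking (IBT) is enforced;
without a landing pad, the indirect branch would fault.

The _CET_ENDBR macro itself is expected to be provided elsewhere
(e.g. by a companion patch to the shared x86 assembly headers). As a
minimal sketch, for illustration only, such a definition could look
like:

	%if ARCH_X86_64
	 %define _CET_ENDBR endbr64	; 64-bit IBT landing pad
	%else
	 %define _CET_ENDBR endbr32	; 32-bit IBT landing pad
	%endif
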
Index: src/x86/mc16_sse.asm
--- src/x86/mc16_sse.asm.orig
+++ src/x86/mc16_sse.asm
@@ -184,12 +184,14 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
test mxyd, mxyd
jnz .v
.put:
+ _CET_ENDBR
tzcnt wd, wd
movzx wd, word [base+put_ssse3_table+wq*2]
add wq, t0
movifnidn hd, hm
jmp wq
.put_w2:
+ _CET_ENDBR
mov r4d, [srcq+ssq*0]
mov r6d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -200,6 +202,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w2
RET
.put_w4:
+ _CET_ENDBR
movq m0, [srcq+ssq*0]
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -210,6 +213,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w4
RET
.put_w8:
+ _CET_ENDBR
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -220,6 +224,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w8
RET
.put_w16:
+ _CET_ENDBR
movu m0, [srcq+ssq*0+16*0]
movu m1, [srcq+ssq*0+16*1]
movu m2, [srcq+ssq*1+16*0]
@@ -234,6 +239,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w16
RET
.put_w32:
+ _CET_ENDBR
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
@@ -248,6 +254,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w32
RET
.put_w64:
+ _CET_ENDBR
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
@@ -270,6 +277,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w64
RET
.put_w128:
+ _CET_ENDBR
add srcq, 16*8
add dstq, 16*8
.put_w128_loop:
@@ -311,6 +319,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w128_loop
RET
.h:
+ _CET_ENDBR
movd m5, mxyd
mov mxyd, r7m ; my
mova m4, [base+pw_16]
@@ -329,6 +338,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
cmp wd, -4
je .h_w4
.h_w2:
+ _CET_ENDBR
movq m1, [srcq+ssq*0]
movhps m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -346,6 +356,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .h_w2
RET
.h_w4:
+ _CET_ENDBR
movq m0, [srcq+ssq*0]
movhps m0, [srcq+ssq*1]
movq m1, [srcq+ssq*0+2]
@@ -363,6 +374,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .h_w4
RET
.h_w8:
+ _CET_ENDBR
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*0+2]
pmullw m0, m4
@@ -385,6 +397,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .h_w8
RET
.h_w16:
+ _CET_ENDBR
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
neg wq
@@ -415,6 +428,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .h_w16_loop0
RET
.v:
+ _CET_ENDBR
shl mxyd, 11
movd m5, mxyd
pshufb m5, [base+pw_256]
@@ -423,6 +437,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .v_w8
je .v_w4
.v_w2:
+ _CET_ENDBR
movd m0, [srcq+ssq*0]
.v_w2_loop:
movd m1, [srcq+ssq*1]
@@ -441,6 +456,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .v_w2_loop
RET
.v_w4:
+ _CET_ENDBR
movq m0, [srcq+ssq*0]
.v_w4_loop:
movq m1, [srcq+ssq*1]
@@ -458,6 +474,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
%if ARCH_X86_64
%if WIN64
push r7
@@ -508,6 +525,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
%endif
RET
.hv:
+ _CET_ENDBR
WIN64_SPILL_XMM 8
shl mxyd, 11
mova m3, [base+pw_2]
@@ -525,6 +543,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .hv_w8
je .hv_w4
.hv_w2:
+ _CET_ENDBR
movddup m0, [srcq+ssq*0]
pshufhw m1, m0, q0321
pmullw m0, m4
@@ -557,6 +576,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .hv_w2_loop
RET
.hv_w4:
+ _CET_ENDBR
movddup m0, [srcq+ssq*0]
movddup m1, [srcq+ssq*0+2]
pmullw m0, m4
@@ -589,6 +609,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
%if ARCH_X86_64
%if WIN64
push r7
@@ -672,6 +693,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
test mxyd, mxyd
jnz .v
.prep:
+ _CET_ENDBR
tzcnt wd, wd
movzx wd, word [base+prep_ssse3_table+wq*2]
mov r5d, r7m ; bitdepth_max
@@ -682,6 +704,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
+ _CET_ENDBR
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
movq m1, [srcq+strideq*2]
@@ -698,6 +721,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w4
RET
.prep_w8:
+ _CET_ENDBR
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*2]
@@ -714,6 +738,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w8
RET
.prep_w16:
+ _CET_ENDBR
movu m0, [srcq+strideq*0+16*0]
movu m1, [srcq+strideq*0+16*1]
movu m2, [srcq+strideq*1+16*0]
@@ -730,6 +755,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w16
RET
.prep_w32:
+ _CET_ENDBR
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
@@ -746,6 +773,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w32
RET
.prep_w64:
+ _CET_ENDBR
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
@@ -772,6 +800,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w64
RET
.prep_w128:
+ _CET_ENDBR
movu m0, [srcq+16* 0]
movu m1, [srcq+16* 1]
movu m2, [srcq+16* 2]
@@ -818,6 +847,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w128
RET
.h:
+ _CET_ENDBR
movd m4, mxyd
mov mxyd, r6m ; my
mova m3, [base+pw_16]
@@ -835,6 +865,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
je .h_w8
jg .h_w16
.h_w4:
+ _CET_ENDBR
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
movq m1, [srcq+strideq*0+2]
@@ -851,6 +882,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .h_w4
RET
.h_w8:
+ _CET_ENDBR
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
@@ -873,6 +905,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .h_w8
RET
.h_w16:
+ _CET_ENDBR
lea srcq, [srcq+wq*2]
neg wq
.h_w16_loop0:
@@ -902,6 +935,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .h_w16_loop0
RET
.v:
+ _CET_ENDBR
movd m4, mxyd
mova m3, [base+pw_16]
pshufb m4, [base+pw_256]
@@ -916,6 +950,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
je .v_w8
jg .v_w16
.v_w4:
+ _CET_ENDBR
movq m0, [srcq+strideq*0]
.v_w4_loop:
movq m2, [srcq+strideq*1]
@@ -934,6 +969,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
movu m0, [srcq+strideq*0]
.v_w8_loop:
movu m2, [srcq+strideq*1]
@@ -956,6 +992,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .v_w8_loop
RET
.v_w16:
+ _CET_ENDBR
%if WIN64
push r7
%endif
@@ -1011,6 +1048,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
%endif
RET
.hv:
+ _CET_ENDBR
WIN64_SPILL_XMM 7
shl mxyd, 11
movd m6, mxyd
@@ -1019,6 +1057,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
je .hv_w8
jg .hv_w16
.hv_w4:
+ _CET_ENDBR
movddup m0, [srcq+strideq*0]
movddup m1, [srcq+strideq*0+2]
pmullw m0, m3
@@ -1048,6 +1087,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
@@ -1084,6 +1124,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .hv_w8_loop
RET
.hv_w16:
+ _CET_ENDBR
%if WIN64
push r7
%endif
@@ -1234,6 +1275,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
%endif
jmp wq
.h:
+ _CET_ENDBR
test myd, 0xf00
jnz .hv
mov myd, r8m
@@ -1252,6 +1294,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
psraw m3, 8 ; sign-extend
je .h_w4
.h_w2:
+ _CET_ENDBR
mova m2, [base+spel_h_shuf2]
pshufd m3, m3, q2121
.h_w2_loop:
@@ -1277,6 +1320,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .h_w2_loop
RET
.h_w4:
+ _CET_ENDBR
WIN64_SPILL_XMM 8
mova m6, [base+spel_h_shufA]
mova m7, [base+spel_h_shufB]
@@ -1302,6 +1346,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
%if WIN64
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
@@ -1378,6 +1423,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .h_w8_loop0
RET
.v:
+ _CET_ENDBR
movzx mxd, myb
shr myd, 16
cmp hd, 6
@@ -1418,6 +1464,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
cmp wd, 2
jne .v_w4
.v_w2:
+ _CET_ENDBR
movd m1, [srcq+ssq*0]
movd m4, [srcq+ssq*1]
movd m2, [srcq+ssq*2]
@@ -1466,6 +1513,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .v_w2_loop
RET
.v_w4:
+ _CET_ENDBR
%if ARCH_X86_32
shl wd, 14
%if STACK_ALIGNMENT < 16
@@ -1604,6 +1652,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .v_w4_loop0
RET
.hv:
+ _CET_ENDBR
%if STACK_ALIGNMENT < 16
%xdefine rstk rsp
%else
@@ -1741,8 +1790,10 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .hv_w2_loop
RET
.hv_w8:
+ _CET_ENDBR
shr mxd, 16
.hv_w4:
+ _CET_ENDBR
movq m2, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
@@ -2060,6 +2111,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h,
%endif
jmp wq
.h:
+ _CET_ENDBR
test myd, 0xf00
jnz .hv
movifnidn ssq, r2mp
@@ -2107,6 +2159,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
WIN64_SPILL_XMM 11
shr mxd, 16
movq m2, [base+subpel_filters+mxq*8]
@@ -2177,6 +2230,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h,
jg .h_w8_loop0
RET
.v:
+ _CET_ENDBR
movzx mxd, myb
shr myd, 16
cmp hd, 4
@@ -2339,6 +2393,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h,
jg .v_loop0
RET
.hv:
+ _CET_ENDBR
%if STACK_ALIGNMENT < 16
%xdefine rstk rsp
%else
@@ -3008,6 +3063,7 @@ cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, s
jmp wq
%if isput
.w2:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
movzx t0d, t0b
@@ -3259,6 +3315,7 @@ cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, s
%endif
INIT_XMM ssse3
.w4:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m11
@@ -3695,22 +3752,27 @@ INIT_XMM ssse3
%define stk rsp+0x20
%endif
.w8:
+ _CET_ENDBR
mov dword [stk+0xf0], 1
movifprep tmp_stridem, 16
jmp .w_start
.w16:
+ _CET_ENDBR
mov dword [stk+0xf0], 2
movifprep tmp_stridem, 32
jmp .w_start
.w32:
+ _CET_ENDBR
mov dword [stk+0xf0], 4
movifprep tmp_stridem, 64
jmp .w_start
.w64:
+ _CET_ENDBR
mov dword [stk+0xf0], 8
movifprep tmp_stridem, 128
jmp .w_start
.w128:
+ _CET_ENDBR
mov dword [stk+0xf0], 16
movifprep tmp_stridem, 256
.w_start:
@@ -4277,6 +4339,7 @@ INIT_XMM ssse3
jmp wq
%if isput
.dy1_w2:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
movzx t0d, t0b
@@ -4477,6 +4540,7 @@ INIT_XMM ssse3
%endif
INIT_XMM ssse3
.dy1_w4:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m11
@@ -4857,22 +4921,27 @@ INIT_XMM ssse3
MC_8TAP_SCALED_RET ; why not jz .ret?
INIT_XMM ssse3
.dy1_w8:
+ _CET_ENDBR
mov dword [stk+0xf0], 1
movifprep tmp_stridem, 16
jmp .dy1_w_start
.dy1_w16:
+ _CET_ENDBR
mov dword [stk+0xf0], 2
movifprep tmp_stridem, 32
jmp .dy1_w_start
.dy1_w32:
+ _CET_ENDBR
mov dword [stk+0xf0], 4
movifprep tmp_stridem, 64
jmp .dy1_w_start
.dy1_w64:
+ _CET_ENDBR
mov dword [stk+0xf0], 8
movifprep tmp_stridem, 128
jmp .dy1_w_start
.dy1_w128:
+ _CET_ENDBR
mov dword [stk+0xf0], 16
movifprep tmp_stridem, 256
.dy1_w_start:
@@ -5395,11 +5464,13 @@ INIT_XMM ssse3
%define stk rsp+0x20
%endif
.dy2:
+ _CET_ENDBR
movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
add wq, base_reg
jmp wq
%if isput
.dy2_w2:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m13
@@ -5609,6 +5680,7 @@ INIT_XMM ssse3
%endif
INIT_XMM ssse3
.dy2_w4:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m11
@@ -5946,22 +6018,27 @@ INIT_XMM ssse3
MC_8TAP_SCALED_RET ; why not jz .ret?
INIT_XMM ssse3
.dy2_w8:
+ _CET_ENDBR
mov dword [stk+0xf0], 1
movifprep tmp_stridem, 16
jmp .dy2_w_start
.dy2_w16:
+ _CET_ENDBR
mov dword [stk+0xf0], 2
movifprep tmp_stridem, 32
jmp .dy2_w_start
.dy2_w32:
+ _CET_ENDBR
mov dword [stk+0xf0], 4
movifprep tmp_stridem, 64
jmp .dy2_w_start
.dy2_w64:
+ _CET_ENDBR
mov dword [stk+0xf0], 8
movifprep tmp_stridem, 128
jmp .dy2_w_start
.dy2_w128:
+ _CET_ENDBR
mov dword [stk+0xf0], 16
movifprep tmp_stridem, 256
.dy2_w_start:
@@ -6812,6 +6889,7 @@ ALIGN function_align
ret
ALIGN function_align
.h:
+ _CET_ENDBR
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
@@ -6890,6 +6968,7 @@ ALIGN function_align
call .main
lea dstq, [dstq+strideq*2]
.w4:
+ _CET_ENDBR
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
@@ -6903,6 +6982,7 @@ ALIGN function_align
call .main
lea dstq, [dstq+strideq*2]
.w8:
+ _CET_ENDBR
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
@@ -6912,6 +6992,7 @@ ALIGN function_align
call .main
add dstq, strideq
.w16:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
dec hd
@@ -6921,6 +7002,7 @@ ALIGN function_align
call .main
add dstq, strideq
.w32:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -6933,6 +7015,7 @@ ALIGN function_align
call .main
add dstq, strideq
.w64:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -6951,6 +7034,7 @@ ALIGN function_align
call .main
add dstq, strideq
.w128:
+ _CET_ENDBR
mova [dstq+16* 0], m0
mova [dstq+16* 1], m1
call .main
@@ -7174,6 +7258,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 4
.w4:
+ _CET_ENDBR
movq [dstq+strideq*0], m0
phaddw m2, m3
movhps [dstq+strideq*1], m0
@@ -7193,6 +7278,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 4
.w8:
+ _CET_ENDBR
mova [dstq+strideq*0], m0
paddw m2, m3
phaddw m2, m2
@@ -7209,6 +7295,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 8
.w16:
+ _CET_ENDBR
mova [dstq+strideq*1+16*0], m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*1+16*1], m3
@@ -7231,6 +7318,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 16
.w32:
+ _CET_ENDBR
mova [dstq+strideq*1+16*0], m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*1+16*1], m3
@@ -7266,6 +7354,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 16*2
.w64:
+ _CET_ENDBR
mova [dstq+strideq*1+16*1], m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*1+16*2], m3
@@ -7330,6 +7419,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 16*4
.w128:
+ _CET_ENDBR
mova [dstq+strideq*1+16* 1], m2
mova [dstq+strideq*0+16* 0], m0
mova [dstq+strideq*1+16* 2], m3
@@ -7511,6 +7601,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w4:
+ _CET_ENDBR
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
@@ -7524,6 +7615,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w8:
+ _CET_ENDBR
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
@@ -7534,6 +7626,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w16:
+ _CET_ENDBR
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m1
call .main
@@ -7546,6 +7639,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w32:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -7558,6 +7652,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w64:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -7576,6 +7671,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w128:
+ _CET_ENDBR
mova [dstq+16* 0], m0
mova [dstq+16* 1], m1
call .main
@@ -7649,6 +7745,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w4:
+ _CET_ENDBR
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
@@ -7662,6 +7759,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w8:
+ _CET_ENDBR
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
@@ -7672,6 +7770,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w16:
+ _CET_ENDBR
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m1
call .main
@@ -7684,6 +7783,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w32:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -7696,6 +7796,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w64:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -7714,6 +7815,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w128:
+ _CET_ENDBR
mova [dstq+16* 0], m0
mova [dstq+16* 1], m1
call .main
@@ -7770,6 +7872,7 @@ cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h,
pxor m6, m6
jmp wq
.w4:
+ _CET_ENDBR
mova m5, [maskq]
movq m0, [dstq+strideq*0]
movhps m0, [dstq+strideq*1]
@@ -7796,6 +7899,7 @@ cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h,
jg .w4
RET
.w8:
+ _CET_ENDBR
mova m5, [maskq]
mova m0, [dstq+strideq*0]
mova m1, [dstq+strideq*1]
@@ -7818,6 +7922,7 @@ cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h,
jg .w8
RET
.w16:
+ _CET_ENDBR
mova m5, [maskq]
mova m0, [dstq+16*0]
mova m1, [dstq+16*1]
@@ -7840,6 +7945,7 @@ cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h,
jg .w16
RET
.w32:
+ _CET_ENDBR
mova m5, [maskq+16*0]
mova m0, [dstq+16*0]
mova m1, [dstq+16*1]
@@ -7886,6 +7992,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
add wq, r5
jmp wq
.w2:
+ _CET_ENDBR
movd m4, [base+obmc_masks+2*2]
.w2_loop:
movd m0, [dstq+strideq*0]
@@ -7906,6 +8013,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
jg .w2_loop
RET
.w4:
+ _CET_ENDBR
movddup m2, [base+obmc_masks+4*2]
.w4_loop:
movq m0, [dstq+strideq*0]
@@ -7922,6 +8030,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
jg .w4_loop
RET
.w8:
+ _CET_ENDBR
mova m4, [base+obmc_masks+8*2]
.w8_loop:
mova m0, [dstq+strideq*0]
@@ -7942,6 +8051,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
jg .w8_loop
RET
.w16:
+ _CET_ENDBR
mova m4, [base+obmc_masks+16*2]
movq m5, [base+obmc_masks+16*3]
.w16_loop:
@@ -7963,6 +8073,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
%if WIN64
movaps [rsp+8], m6
%endif
@@ -8030,6 +8141,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
neg hq
jmp wq
.w2:
+ _CET_ENDBR
movd m0, [dstq+dsq*0]
movd m2, [dstq+dsq*1]
movd m3, [maskq+hq*2]
@@ -8048,6 +8160,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w2
RET
.w4:
+ _CET_ENDBR
mova m3, [base+blend_shuf]
.w4_loop:
movq m0, [dstq+dsq*0]
@@ -8066,6 +8179,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w4_loop
RET
.w8:
+ _CET_ENDBR
movddup m5, [base+blend_shuf+8]
%if WIN64
movaps [rsp+ 8], m6
@@ -8097,6 +8211,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
%endif
RET
.w16:
+ _CET_ENDBR
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0, 2
@@ -8105,6 +8220,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w16
RET
.w32:
+ _CET_ENDBR
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0
@@ -8114,6 +8230,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w32
RET
.w64:
+ _CET_ENDBR
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0
@@ -8125,6 +8242,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w64
RET
.w128:
+ _CET_ENDBR
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0