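Add Intel CET (Control-flow Enforcement Technology) landing pads to the 16bpc
SSE/SSSE3 motion-compensation assembly (dav1d's src/x86/mc16_sse.asm). The
jump tables in this file (put_ssse3_table, prep_ssse3_table, the 8tap_scaled
dy1/dy2 tables) dispatch through indirect branches such as "jmp wq", so when
indirect branch tracking (IBT) is enforced, every label reached that way must
begin with an ENDBR instruction. This patch marks each such label with
_CET_ENDBR.

The _CET_ENDBR macro itself is defined outside this diff, presumably in a
companion change to x86inc.asm. A minimal sketch of what such a definition
could look like, assuming the build defines __CET__ for nasm when
-fcf-protection is enabled; the names below are illustrative, not part of
this patch:

    %ifdef __CET__              ; assumed to be passed via -D__CET__
        %if ARCH_X86_64
            %define _CET_ENDBR endbr64  ; 64-bit IBT landing pad
        %else
            %define _CET_ENDBR endbr32  ; 32-bit IBT landing pad
        %endif
    %else
        %define _CET_ENDBR      ; expands to nothing in non-CET builds
    %endif

ENDBR encodes as a NOP on CPUs without CET, so marking the labels is harmless
where the feature is unavailable.
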
Index: src/x86/mc16_sse.asm
--- src/x86/mc16_sse.asm.orig
+++ src/x86/mc16_sse.asm
@@ -184,12 +184,14 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
test mxyd, mxyd
jnz .v
.put:
+ _CET_ENDBR
tzcnt wd, wd
movzx wd, word [base+put_ssse3_table+wq*2]
add wq, t0
movifnidn hd, hm
jmp wq
.put_w2:
+ _CET_ENDBR
mov r4d, [srcq+ssq*0]
mov r6d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -200,6 +202,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w2
RET
.put_w4:
+ _CET_ENDBR
movq m0, [srcq+ssq*0]
movq m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -210,6 +213,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w4
RET
.put_w8:
+ _CET_ENDBR
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -220,6 +224,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w8
RET
.put_w16:
+ _CET_ENDBR
movu m0, [srcq+ssq*0+16*0]
movu m1, [srcq+ssq*0+16*1]
movu m2, [srcq+ssq*1+16*0]
@@ -234,6 +239,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w16
RET
.put_w32:
+ _CET_ENDBR
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
@@ -248,6 +254,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w32
RET
.put_w64:
+ _CET_ENDBR
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
@@ -270,6 +277,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w64
RET
.put_w128:
+ _CET_ENDBR
add srcq, 16*8
add dstq, 16*8
.put_w128_loop:
@@ -311,6 +319,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .put_w128_loop
RET
.h:
+ _CET_ENDBR
movd m5, mxyd
mov mxyd, r7m ; my
mova m4, [base+pw_16]
@@ -329,6 +338,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
cmp wd, -4
je .h_w4
.h_w2:
+ _CET_ENDBR
movq m1, [srcq+ssq*0]
movhps m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -346,6 +356,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .h_w2
RET
.h_w4:
+ _CET_ENDBR
movq m0, [srcq+ssq*0]
movhps m0, [srcq+ssq*1]
movq m1, [srcq+ssq*0+2]
@@ -363,6 +374,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .h_w4
RET
.h_w8:
+ _CET_ENDBR
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*0+2]
pmullw m0, m4
@@ -385,6 +397,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .h_w8
RET
.h_w16:
+ _CET_ENDBR
lea srcq, [srcq+wq*2]
lea dstq, [dstq+wq*2]
neg wq
@@ -415,6 +428,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .h_w16_loop0
RET
.v:
+ _CET_ENDBR
shl mxyd, 11
movd m5, mxyd
pshufb m5, [base+pw_256]
@@ -423,6 +437,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .v_w8
je .v_w4
.v_w2:
+ _CET_ENDBR
movd m0, [srcq+ssq*0]
.v_w2_loop:
movd m1, [srcq+ssq*1]
@@ -441,6 +456,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .v_w2_loop
RET
.v_w4:
+ _CET_ENDBR
movq m0, [srcq+ssq*0]
.v_w4_loop:
movq m1, [srcq+ssq*1]
@@ -458,6 +474,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
%if ARCH_X86_64
%if WIN64
push r7
@@ -508,6 +525,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
%endif
RET
.hv:
+ _CET_ENDBR
WIN64_SPILL_XMM 8
shl mxyd, 11
mova m3, [base+pw_2]
@@ -525,6 +543,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .hv_w8
je .hv_w4
.hv_w2:
+ _CET_ENDBR
movddup m0, [srcq+ssq*0]
pshufhw m1, m0, q0321
pmullw m0, m4
@@ -557,6 +576,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .hv_w2_loop
RET
.hv_w4:
+ _CET_ENDBR
movddup m0, [srcq+ssq*0]
movddup m1, [srcq+ssq*0+2]
pmullw m0, m4
@@ -589,6 +609,7 @@ cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w,
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
%if ARCH_X86_64
%if WIN64
push r7
@@ -672,6 +693,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
test mxyd, mxyd
jnz .v
.prep:
+ _CET_ENDBR
tzcnt wd, wd
movzx wd, word [base+prep_ssse3_table+wq*2]
mov r5d, r7m ; bitdepth_max
@@ -682,6 +704,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
+ _CET_ENDBR
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
movq m1, [srcq+strideq*2]
@@ -698,6 +721,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w4
RET
.prep_w8:
+ _CET_ENDBR
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*1]
movu m2, [srcq+strideq*2]
@@ -714,6 +738,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w8
RET
.prep_w16:
+ _CET_ENDBR
movu m0, [srcq+strideq*0+16*0]
movu m1, [srcq+strideq*0+16*1]
movu m2, [srcq+strideq*1+16*0]
@@ -730,6 +755,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w16
RET
.prep_w32:
+ _CET_ENDBR
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
@@ -746,6 +773,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w32
RET
.prep_w64:
+ _CET_ENDBR
movu m0, [srcq+16*0]
movu m1, [srcq+16*1]
movu m2, [srcq+16*2]
@@ -772,6 +800,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w64
RET
.prep_w128:
+ _CET_ENDBR
movu m0, [srcq+16* 0]
movu m1, [srcq+16* 1]
movu m2, [srcq+16* 2]
@@ -818,6 +847,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .prep_w128
RET
.h:
+ _CET_ENDBR
movd m4, mxyd
mov mxyd, r6m ; my
mova m3, [base+pw_16]
@@ -835,6 +865,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
je .h_w8
jg .h_w16
.h_w4:
+ _CET_ENDBR
movq m0, [srcq+strideq*0]
movhps m0, [srcq+strideq*1]
movq m1, [srcq+strideq*0+2]
@@ -851,6 +882,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .h_w4
RET
.h_w8:
+ _CET_ENDBR
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
@@ -873,6 +905,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .h_w8
RET
.h_w16:
+ _CET_ENDBR
lea srcq, [srcq+wq*2]
neg wq
.h_w16_loop0:
@@ -902,6 +935,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .h_w16_loop0
RET
.v:
+ _CET_ENDBR
movd m4, mxyd
mova m3, [base+pw_16]
pshufb m4, [base+pw_256]
@@ -916,6 +950,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
je .v_w8
jg .v_w16
.v_w4:
+ _CET_ENDBR
movq m0, [srcq+strideq*0]
.v_w4_loop:
movq m2, [srcq+strideq*1]
@@ -934,6 +969,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
movu m0, [srcq+strideq*0]
.v_w8_loop:
movu m2, [srcq+strideq*1]
@@ -956,6 +992,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .v_w8_loop
RET
.v_w16:
+ _CET_ENDBR
%if WIN64
push r7
%endif
@@ -1011,6 +1048,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
%endif
RET
.hv:
+ _CET_ENDBR
WIN64_SPILL_XMM 7
shl mxyd, 11
movd m6, mxyd
@@ -1019,6 +1057,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
je .hv_w8
jg .hv_w16
.hv_w4:
+ _CET_ENDBR
movddup m0, [srcq+strideq*0]
movddup m1, [srcq+strideq*0+2]
pmullw m0, m3
@@ -1048,6 +1087,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
movu m0, [srcq+strideq*0]
movu m1, [srcq+strideq*0+2]
pmullw m0, m3
@@ -1084,6 +1124,7 @@ cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w
jg .hv_w8_loop
RET
.hv_w16:
+ _CET_ENDBR
%if WIN64
push r7
%endif
@@ -1234,6 +1275,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
%endif
jmp wq
.h:
+ _CET_ENDBR
test myd, 0xf00
jnz .hv
mov myd, r8m
@@ -1252,6 +1294,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
psraw m3, 8 ; sign-extend
je .h_w4
.h_w2:
+ _CET_ENDBR
mova m2, [base+spel_h_shuf2]
pshufd m3, m3, q2121
.h_w2_loop:
@@ -1277,6 +1320,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .h_w2_loop
RET
.h_w4:
+ _CET_ENDBR
WIN64_SPILL_XMM 8
mova m6, [base+spel_h_shufA]
mova m7, [base+spel_h_shufB]
@@ -1302,6 +1346,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
%if WIN64
%assign stack_offset stack_offset - stack_size_padded
WIN64_SPILL_XMM 12
@@ -1378,6 +1423,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .h_w8_loop0
RET
.v:
+ _CET_ENDBR
movzx mxd, myb
shr myd, 16
cmp hd, 6
@@ -1418,6 +1464,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
cmp wd, 2
jne .v_w4
.v_w2:
+ _CET_ENDBR
movd m1, [srcq+ssq*0]
movd m4, [srcq+ssq*1]
movd m2, [srcq+ssq*2]
@@ -1466,6 +1513,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .v_w2_loop
RET
.v_w4:
+ _CET_ENDBR
%if ARCH_X86_32
shl wd, 14
%if STACK_ALIGNMENT < 16
@@ -1604,6 +1652,7 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .v_w4_loop0
RET
.hv:
+ _CET_ENDBR
%if STACK_ALIGNMENT < 16
%xdefine rstk rsp
%else
@@ -1741,8 +1790,10 @@ cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w,
jg .hv_w2_loop
RET
.hv_w8:
+ _CET_ENDBR
shr mxd, 16
.hv_w4:
+ _CET_ENDBR
movq m2, [base+subpel_filters+mxq*8]
movzx mxd, myb
shr myd, 16
@@ -2060,6 +2111,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h,
%endif
jmp wq
.h:
+ _CET_ENDBR
test myd, 0xf00
jnz .hv
movifnidn ssq, r2mp
@@ -2107,6 +2159,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
WIN64_SPILL_XMM 11
shr mxd, 16
movq m2, [base+subpel_filters+mxq*8]
@@ -2177,6 +2230,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h,
jg .h_w8_loop0
RET
.v:
+ _CET_ENDBR
movzx mxd, myb
shr myd, 16
cmp hd, 4
@@ -2339,6 +2393,7 @@ cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h,
jg .v_loop0
RET
.hv:
+ _CET_ENDBR
%if STACK_ALIGNMENT < 16
%xdefine rstk rsp
%else
@@ -3008,6 +3063,7 @@ cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, s
jmp wq
%if isput
.w2:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
movzx t0d, t0b
@@ -3259,6 +3315,7 @@ cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, s
%endif
INIT_XMM ssse3
.w4:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m11
@@ -3695,22 +3752,27 @@ INIT_XMM ssse3
%define stk rsp+0x20
%endif
.w8:
+ _CET_ENDBR
mov dword [stk+0xf0], 1
movifprep tmp_stridem, 16
jmp .w_start
.w16:
+ _CET_ENDBR
mov dword [stk+0xf0], 2
movifprep tmp_stridem, 32
jmp .w_start
.w32:
+ _CET_ENDBR
mov dword [stk+0xf0], 4
movifprep tmp_stridem, 64
jmp .w_start
.w64:
+ _CET_ENDBR
mov dword [stk+0xf0], 8
movifprep tmp_stridem, 128
jmp .w_start
.w128:
+ _CET_ENDBR
mov dword [stk+0xf0], 16
movifprep tmp_stridem, 256
.w_start:
@@ -4277,6 +4339,7 @@ INIT_XMM ssse3
jmp wq
%if isput
.dy1_w2:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
movzx t0d, t0b
@@ -4477,6 +4540,7 @@ INIT_XMM ssse3
%endif
INIT_XMM ssse3
.dy1_w4:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m11
@@ -4857,22 +4921,27 @@ INIT_XMM ssse3
MC_8TAP_SCALED_RET ; why not jz .ret?
INIT_XMM ssse3
.dy1_w8:
+ _CET_ENDBR
mov dword [stk+0xf0], 1
movifprep tmp_stridem, 16
jmp .dy1_w_start
.dy1_w16:
+ _CET_ENDBR
mov dword [stk+0xf0], 2
movifprep tmp_stridem, 32
jmp .dy1_w_start
.dy1_w32:
+ _CET_ENDBR
mov dword [stk+0xf0], 4
movifprep tmp_stridem, 64
jmp .dy1_w_start
.dy1_w64:
+ _CET_ENDBR
mov dword [stk+0xf0], 8
movifprep tmp_stridem, 128
jmp .dy1_w_start
.dy1_w128:
+ _CET_ENDBR
mov dword [stk+0xf0], 16
movifprep tmp_stridem, 256
.dy1_w_start:
@@ -5395,11 +5464,13 @@ INIT_XMM ssse3
%define stk rsp+0x20
%endif
.dy2:
+ _CET_ENDBR
movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
add wq, base_reg
jmp wq
%if isput
.dy2_w2:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m13
@@ -5609,6 +5680,7 @@ INIT_XMM ssse3
%endif
INIT_XMM ssse3
.dy2_w4:
+ _CET_ENDBR
%if ARCH_X86_64
mov myd, mym
mova [rsp+0x10], m11
@@ -5946,22 +6018,27 @@ INIT_XMM ssse3
MC_8TAP_SCALED_RET ; why not jz .ret?
INIT_XMM ssse3
.dy2_w8:
+ _CET_ENDBR
mov dword [stk+0xf0], 1
movifprep tmp_stridem, 16
jmp .dy2_w_start
.dy2_w16:
+ _CET_ENDBR
mov dword [stk+0xf0], 2
movifprep tmp_stridem, 32
jmp .dy2_w_start
.dy2_w32:
+ _CET_ENDBR
mov dword [stk+0xf0], 4
movifprep tmp_stridem, 64
jmp .dy2_w_start
.dy2_w64:
+ _CET_ENDBR
mov dword [stk+0xf0], 8
movifprep tmp_stridem, 128
jmp .dy2_w_start
.dy2_w128:
+ _CET_ENDBR
mov dword [stk+0xf0], 16
movifprep tmp_stridem, 256
.dy2_w_start:
@@ -6812,6 +6889,7 @@ ALIGN function_align
ret
ALIGN function_align
.h:
+ _CET_ENDBR
lea tmpd, [mxq+alphaq]
shr mxd, 10
movq m3, [filterq+mxq*8]
@@ -6890,6 +6968,7 @@ ALIGN function_align
call .main
lea dstq, [dstq+strideq*2]
.w4:
+ _CET_ENDBR
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
@@ -6903,6 +6982,7 @@ ALIGN function_align
call .main
lea dstq, [dstq+strideq*2]
.w8:
+ _CET_ENDBR
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
@@ -6912,6 +6992,7 @@ ALIGN function_align
call .main
add dstq, strideq
.w16:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
dec hd
@@ -6921,6 +7002,7 @@ ALIGN function_align
call .main
add dstq, strideq
.w32:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -6933,6 +7015,7 @@ ALIGN function_align
call .main
add dstq, strideq
.w64:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -6951,6 +7034,7 @@ ALIGN function_align
call .main
add dstq, strideq
.w128:
+ _CET_ENDBR
mova [dstq+16* 0], m0
mova [dstq+16* 1], m1
call .main
@@ -7174,6 +7258,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 4
.w4:
+ _CET_ENDBR
movq [dstq+strideq*0], m0
phaddw m2, m3
movhps [dstq+strideq*1], m0
@@ -7193,6 +7278,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 4
.w8:
+ _CET_ENDBR
mova [dstq+strideq*0], m0
paddw m2, m3
phaddw m2, m2
@@ -7209,6 +7295,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 8
.w16:
+ _CET_ENDBR
mova [dstq+strideq*1+16*0], m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*1+16*1], m3
@@ -7231,6 +7318,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 16
.w32:
+ _CET_ENDBR
mova [dstq+strideq*1+16*0], m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*1+16*1], m3
@@ -7266,6 +7354,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 16*2
.w64:
+ _CET_ENDBR
mova [dstq+strideq*1+16*1], m2
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*1+16*2], m3
@@ -7330,6 +7419,7 @@ cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1,
lea dstq, [dstq+strideq*2]
add maskq, 16*4
.w128:
+ _CET_ENDBR
mova [dstq+strideq*1+16* 1], m2
mova [dstq+strideq*0+16* 0], m0
mova [dstq+strideq*1+16* 2], m3
@@ -7511,6 +7601,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w4:
+ _CET_ENDBR
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
@@ -7524,6 +7615,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w8:
+ _CET_ENDBR
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
@@ -7534,6 +7626,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w16:
+ _CET_ENDBR
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m1
call .main
@@ -7546,6 +7639,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w32:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -7558,6 +7652,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w64:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -7576,6 +7671,7 @@ cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w128:
+ _CET_ENDBR
mova [dstq+16* 0], m0
mova [dstq+16* 1], m1
call .main
@@ -7649,6 +7745,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w4:
+ _CET_ENDBR
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
lea dstq, [dstq+strideq*2]
@@ -7662,6 +7759,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w8:
+ _CET_ENDBR
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
sub hd, 2
@@ -7672,6 +7770,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
lea dstq, [dstq+strideq*2]
.w16:
+ _CET_ENDBR
mova [dstq+strideq*0+16*0], m0
mova [dstq+strideq*0+16*1], m1
call .main
@@ -7684,6 +7783,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w32:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -7696,6 +7796,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w64:
+ _CET_ENDBR
mova [dstq+16*0], m0
mova [dstq+16*1], m1
call .main
@@ -7714,6 +7815,7 @@ cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1,
call .main
add dstq, strideq
.w128:
+ _CET_ENDBR
mova [dstq+16* 0], m0
mova [dstq+16* 1], m1
call .main
@@ -7770,6 +7872,7 @@ cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h,
pxor m6, m6
jmp wq
.w4:
+ _CET_ENDBR
mova m5, [maskq]
movq m0, [dstq+strideq*0]
movhps m0, [dstq+strideq*1]
@@ -7796,6 +7899,7 @@ cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h,
jg .w4
RET
.w8:
+ _CET_ENDBR
mova m5, [maskq]
mova m0, [dstq+strideq*0]
mova m1, [dstq+strideq*1]
@@ -7818,6 +7922,7 @@ cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h,
jg .w8
RET
.w16:
+ _CET_ENDBR
mova m5, [maskq]
mova m0, [dstq+16*0]
mova m1, [dstq+16*1]
@@ -7840,6 +7945,7 @@ cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h,
jg .w16
RET
.w32:
+ _CET_ENDBR
mova m5, [maskq+16*0]
mova m0, [dstq+16*0]
mova m1, [dstq+16*1]
@@ -7886,6 +7992,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
add wq, r5
jmp wq
.w2:
+ _CET_ENDBR
movd m4, [base+obmc_masks+2*2]
.w2_loop:
movd m0, [dstq+strideq*0]
@@ -7906,6 +8013,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
jg .w2_loop
RET
.w4:
+ _CET_ENDBR
movddup m2, [base+obmc_masks+4*2]
.w4_loop:
movq m0, [dstq+strideq*0]
@@ -7922,6 +8030,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
jg .w4_loop
RET
.w8:
+ _CET_ENDBR
mova m4, [base+obmc_masks+8*2]
.w8_loop:
mova m0, [dstq+strideq*0]
@@ -7942,6 +8051,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
jg .w8_loop
RET
.w16:
+ _CET_ENDBR
mova m4, [base+obmc_masks+16*2]
movq m5, [base+obmc_masks+16*3]
.w16_loop:
@@ -7963,6 +8073,7 @@ cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
%if WIN64
movaps [rsp+8], m6
%endif
@@ -8030,6 +8141,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
neg hq
jmp wq
.w2:
+ _CET_ENDBR
movd m0, [dstq+dsq*0]
movd m2, [dstq+dsq*1]
movd m3, [maskq+hq*2]
@@ -8048,6 +8160,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w2
RET
.w4:
+ _CET_ENDBR
mova m3, [base+blend_shuf]
.w4_loop:
movq m0, [dstq+dsq*0]
@@ -8066,6 +8179,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w4_loop
RET
.w8:
+ _CET_ENDBR
movddup m5, [base+blend_shuf+8]
%if WIN64
movaps [rsp+ 8], m6
@@ -8097,6 +8211,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
%endif
RET
.w16:
+ _CET_ENDBR
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0, 2
@@ -8105,6 +8220,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w16
RET
.w32:
+ _CET_ENDBR
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0
@@ -8114,6 +8230,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w32
RET
.w64:
+ _CET_ENDBR
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0
@@ -8125,6 +8242,7 @@ cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, ma
jl .w64
RET
.w128:
+ _CET_ENDBR
movd m5, [maskq+hq*2]
pshufb m5, m4
BLEND_H_ROW 0, 0