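Every branch target reached through a computed jump (`jmp wq` or `jmp r6` after indexing
a per-width jump table) must begin with an ENDBR landing pad when the object is built
with Intel CET indirect-branch tracking; otherwise the indirect jump raises a
control-protection fault. This patch drops the `_CET_ENDBR` marker at each table-driven
label in the SSE2/SSSE3 motion-compensation code: the put/prep bilin and 8tap width
cases, the scaled-8tap `.dy1`/`.dy2` paths, the bidirectional tile-store loops, the
w_mask_420/422/444 cases, and the blend functions. The dispatch pattern the landing
pads protect looks like this (taken from `.put` below):

    movzx wd, word [t0+wq*2+table_offset(put,)] ; load per-width table offset
    add   wq, t0                                ; compute target address
    jmp   wq                                    ; indirect jump; target needs ENDBR

`_CET_ENDBR` itself is defined outside this patch. A minimal sketch of the assumed
definition, for illustration only (the gating name HAVE_CET is hypothetical; the real
definition lives in the build's shared asm headers):

    %ifdef HAVE_CET                      ; hypothetical build flag
        %if ARCH_X86_64
            %define _CET_ENDBR endbr64   ; f3 0f 1e fa
        %else
            %define _CET_ENDBR endbr32   ; f3 0f 1e fb
        %endif
    %else
        %define _CET_ENDBR               ; expands to nothing on non-CET builds
    %endif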
Index: src/x86/mc_sse.asm
--- src/x86/mc_sse.asm.orig
+++ src/x86/mc_sse.asm
@@ -344,11 +344,13 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     test mxyd, mxyd
     jnz .v
 .put:
+    _CET_ENDBR
     movzx wd, word [t0+wq*2+table_offset(put,)]
     add wq, t0
     RESTORE_DSQ_32 t0
     jmp wq
 .put_w2:
+    _CET_ENDBR
     movzx r4d, word [srcq+ssq*0]
     movzx r6d, word [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -359,6 +361,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .put_w2
     RET
 .put_w4:
+    _CET_ENDBR
     mov r4d, [srcq+ssq*0]
     mov r6d, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -369,6 +372,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .put_w4
     RET
 .put_w8:
+    _CET_ENDBR
     movq m0, [srcq+ssq*0]
     movq m1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -379,6 +383,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .put_w8
     RET
 .put_w16:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0]
     movu m1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -389,6 +394,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .put_w16
     RET
 .put_w32:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0+16*0]
     movu m1, [srcq+ssq*0+16*1]
     movu m2, [srcq+ssq*1+16*0]
@@ -403,6 +409,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .put_w32
     RET
 .put_w64:
+    _CET_ENDBR
     movu m0, [srcq+16*0]
     movu m1, [srcq+16*1]
     movu m2, [srcq+16*2]
@@ -417,6 +424,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .put_w64
     RET
 .put_w128:
+    _CET_ENDBR
     movu m0, [srcq+16*0]
     movu m1, [srcq+16*1]
     movu m2, [srcq+16*2]
@@ -439,6 +447,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .put_w128
     RET
 .h:
+    _CET_ENDBR
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
     imul mxyd, 0x00ff00ff
@@ -456,6 +465,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     movifnidn dsq, dsmp
     jmp wq
 .h_w2:
+    _CET_ENDBR
     pshufd m4, m4, q3120 ; m4 = {1, 0, 2, 1, 5, 4, 6, 5}
 .h_w2_loop:
     movd m0, [srcq+ssq*0]
@@ -475,6 +485,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .h_w2_loop
     RET
 .h_w4:
+    _CET_ENDBR
     movq m4, [srcq+ssq*0]
     movhps m4, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -490,6 +501,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .h_w4
     RET
 .h_w8:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0]
     movu m1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -507,6 +519,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu m0, [srcq+8*0]
     movu m1, [srcq+8*1]
     add srcq, ssq
@@ -523,6 +536,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     movu m0, [srcq+mmsize*0+8*0]
     movu m1, [srcq+mmsize*0+8*1]
     pshufb m0, m4
@@ -549,6 +563,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .h_w32
     RET
 .h_w64:
+    _CET_ENDBR
     mov r6, -16*3
 .h_w64_loop:
     movu m0, [srcq+r6+16*3+8*0]
@@ -569,6 +584,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     mov r6, -16*7
 .h_w128_loop:
     movu m0, [srcq+r6+16*7+8*0]
@@ -589,6 +605,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .h_w128
     RET
 .v:
+    _CET_ENDBR
     movzx wd, word [t0+wq*2+table_offset(put, _bilin_v)]
     imul mxyd, 0x00ff00ff
     mova m5, [base+pw_2048]
@@ -599,6 +616,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     movifnidn dsq, dsmp
     jmp wq
 .v_w2:
+    _CET_ENDBR
     movd m0, [srcq+ssq*0]
 .v_w2_loop:
     pinsrw m0, [srcq+ssq*1], 1 ; 0 1
@@ -618,6 +636,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd m0, [srcq+ssq*0]
 .v_w4_loop:
     movd m2, [srcq+ssq*1]
@@ -639,6 +658,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq m0, [srcq+ssq*0]
 .v_w8_loop:
     movq m2, [srcq+ssq*1]
@@ -687,17 +707,22 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg %%loop
 %endmacro
 .v_w16:
+    _CET_ENDBR
     PUT_BILIN_V_W16
     RET
 .v_w128:
+    _CET_ENDBR
     lea r6d, [hq+(7<<16)]
     jmp .v_w16gt
 .v_w64:
+    _CET_ENDBR
     lea r6d, [hq+(3<<16)]
     jmp .v_w16gt
 .v_w32:
+    _CET_ENDBR
     lea r6d, [hq+(1<<16)]
 .v_w16gt:
+    _CET_ENDBR
     mov r4, srcq
 %if ARCH_X86_64
     mov r7, dstq
@@ -722,6 +747,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .v_w16gt
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
     movzx wd, word [t0+wq*2+table_offset(put, _bilin_hv)]
@@ -735,6 +761,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     punpcklqdq m6, m6
     jmp wq
 .hv_w2:
+    _CET_ENDBR
     RESTORE_DSQ_32 t0
     movd m0, [srcq+ssq*0]
     punpckldq m0, m0
@@ -769,6 +796,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova m4, [base+bilin_h_shuf4]
     movddup m0, [srcq+ssq*0]
     movifnidn dsq, dsmp
@@ -796,6 +824,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0]
     movifnidn dsq, dsmp
     pshufb m0, m4
@@ -826,12 +855,15 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     jg .hv_w8_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea r6d, [hq+(7<<16)]
     jmp .hv_w16_start
 .hv_w64:
+    _CET_ENDBR
     lea r6d, [hq+(3<<16)]
     jmp .hv_w16_start
 .hv_w32:
+    _CET_ENDBR
     lea r6d, [hq+(1<<16)]
 .hv_w16_start:
     mov r4, srcq
@@ -841,6 +873,7 @@ cglobal put_bilin_8bpc, 1, 8, 0, dst, ds, src, ss, w,
     mov r7, dstq
 %endif
 .hv_w16:
+    _CET_ENDBR
     movifnidn dsq, dsmp
 %if WIN64
     movaps r4m, m8
@@ -967,6 +1000,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     test mxyd, mxyd
     jnz .v
 .prep:
+    _CET_ENDBR
 %if notcpuflag(ssse3)
     add r6, prep_ssse3 - prep_sse2
     jmp prep_ssse3
@@ -977,6 +1011,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea stride3q, [strideq*3]
     jmp wq
 .prep_w4:
+    _CET_ENDBR
     movd m0, [srcq+strideq*0]
     movd m1, [srcq+strideq*1]
     movd m2, [srcq+strideq*2]
@@ -995,6 +1030,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w4
     RET
 .prep_w8:
+    _CET_ENDBR
     movq m0, [srcq+strideq*0]
     movq m1, [srcq+strideq*1]
     movq m2, [srcq+strideq*2]
@@ -1017,6 +1053,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w8
     RET
 .prep_w16:
+    _CET_ENDBR
     movu m1, [srcq+strideq*0]
     movu m3, [srcq+strideq*1]
     lea srcq, [srcq+strideq*2]
@@ -1037,12 +1074,15 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w16
     RET
 .prep_w128:
+    _CET_ENDBR
     mov r3, -128
     jmp .prep_w32_start
 .prep_w64:
+    _CET_ENDBR
     mov r3, -64
     jmp .prep_w32_start
 .prep_w32:
+    _CET_ENDBR
     mov r3, -32
 .prep_w32_start:
     sub srcq, r3
@@ -1072,6 +1112,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     RET
 %endif
 .h:
+    _CET_ENDBR
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
 %if cpuflag(ssse3)
@@ -1095,6 +1136,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     add wq, r6
     jmp wq
 .h_w4:
+    _CET_ENDBR
 %if cpuflag(ssse3)
     mova m4, [base+bilin_h_shuf4]
 %endif
@@ -1116,6 +1158,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     lea stride3q, [strideq*3]
 .h_w8_loop:
     movu m0, [srcq+strideq*0]
@@ -1140,6 +1183,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w8_loop
     RET
 .h_w16:
+    _CET_ENDBR
     movu m0, [srcq+strideq*0+8*0]
     movu m1, [srcq+strideq*0+8*1]
     movu m2, [srcq+strideq*1+8*0]
@@ -1162,12 +1206,15 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w16
     RET
 .h_w128:
+    _CET_ENDBR
     mov r3, -128
     jmp .h_w32_start
 .h_w64:
+    _CET_ENDBR
     mov r3, -64
     jmp .h_w32_start
 .h_w32:
+    _CET_ENDBR
     mov r3, -32
 .h_w32_start:
     sub srcq, r3
@@ -1198,6 +1245,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w32_vloop
     RET
 .v:
+    _CET_ENDBR
 %if notcpuflag(ssse3)
 %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM 8
@@ -1217,6 +1265,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     pshufd m5, m5, q0000
     jmp wq
 .v_w4:
+    _CET_ENDBR
     movd m0, [srcq+strideq*0]
 .v_w4_loop:
     movd m1, [srcq+strideq*1]
@@ -1239,6 +1288,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq m0, [srcq+strideq*0]
 .v_w8_loop:
     movq m1, [srcq+strideq*1]
@@ -1263,6 +1313,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     movu m0, [srcq+strideq*0]
 .v_w16_loop:
     movu m1, [srcq+strideq*1]
@@ -1299,14 +1350,17 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w16_loop
     RET
 .v_w128:
+    _CET_ENDBR
     lea r3d, [hq+(3<<8)]
     mov r6d, 256
     jmp .v_w32_start
 .v_w64:
+    _CET_ENDBR
     lea r3d, [hq+(1<<8)]
     mov r6d, 128
     jmp .v_w32_start
 .v_w32:
+    _CET_ENDBR
     xor r3d, r3d
     mov r6d, 64
 .v_w32_start:
@@ -1372,6 +1426,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
 %endif
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
     movzx wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
@@ -1394,6 +1449,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     pshufd m6, m6, q0000
     jmp wq
 .hv_w4:
+    _CET_ENDBR
 %if cpuflag(ssse3)
     mova m4, [base+bilin_h_shuf4]
     movddup m0, [srcq+strideq*0]
@@ -1429,6 +1485,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     movu m0, [srcq+strideq*0]
     PSHUFB_BILIN_H8 m0, m4
     PMADDUBSW m0, m5, m7, m4, 0 ; 0
@@ -1454,18 +1511,22 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w8_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea r3d, [hq+(7<<8)]
     mov r5d, 256
     jmp .hv_w16_start
 .hv_w64:
+    _CET_ENDBR
     lea r3d, [hq+(3<<8)]
     mov r5d, 128
     jmp .hv_w16_start
 .hv_w32:
+    _CET_ENDBR
     lea r3d, [hq+(1<<8)]
     mov r5d, 64
     jmp .hv_w16_start
 .hv_w16:
+    _CET_ENDBR
     xor r3d, r3d
     mov r5d, 32
 .hv_w16_start:
@@ -1627,6 +1688,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     lea r6, [ssq*3]
     jmp wq
 .h:
+    _CET_ENDBR
 %if ARCH_X86_32
     test ssd, 0xf00
 %else
@@ -1654,6 +1716,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     add wq, base_reg
     jmp wq
 .h_w2:
+    _CET_ENDBR
 %if ARCH_X86_32
     and mxd, 0x7f
 %else
@@ -1684,6 +1747,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .h_w2_loop
     RET
 .h_w4:
+    _CET_ENDBR
 %if ARCH_X86_32
     and mxd, 0x7f
 %else
@@ -1735,6 +1799,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     psraw %1, 6
 %endmacro
 .h_w8:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0]
     movu m1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -1755,15 +1820,19 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .h_w8
     RET
 .h_w128:
+    _CET_ENDBR
     mov r4, -16*7
     jmp .h_w16_start
 .h_w64:
+    _CET_ENDBR
     mov r4, -16*3
     jmp .h_w16_start
 .h_w32:
+    _CET_ENDBR
     mov r4, -16*1
     jmp .h_w16_start
 .h_w16:
+    _CET_ENDBR
     xor r4d, r4d
 .h_w16_start:
     sub srcq, r4
@@ -1785,6 +1854,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .h_w16_loop_v
     RET
 .v:
+    _CET_ENDBR
 %if ARCH_X86_32
     movzx mxd, ssb
     shr ssd, 16
@@ -1840,6 +1910,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
 %endif
     jmp r6
 .v_w2:
+    _CET_ENDBR
     movd m1, [srcq+ssq*0]
     movd m0, [srcq+ssq*1]
 %if ARCH_X86_32
@@ -1895,12 +1966,14 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
 %if ARCH_X86_32
 .v_w8:
 .v_w16:
 .v_w32:
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     shl wd, 14
 %if STACK_ALIGNMENT < 16
 %define dstm [rsp+mmsize*4+gprsize]
@@ -1979,6 +2052,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
 .v_w32:
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     lea r6d, [wq*8-64]
     mov r4, srcq
     mov r7, dstq
@@ -2048,6 +2122,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
 %undef subpel2
 %undef subpel3
 .hv:
+    _CET_ENDBR
 %assign stack_offset org_stack_offset
     cmp wd, 4
     jg .hv_w8
@@ -2113,6 +2188,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
     cmp wd, 4
     je .hv_w4
 .hv_w2:
+    _CET_ENDBR
     mova m6, [base+subpel_h_shuf4]
     movq m2, [srcq+ssq*0] ; 0
     movhps m2, [srcq+ssq*1] ; 0 _ 1
@@ -2192,6 +2268,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
 %undef w8192reg
 %undef d512reg
 .hv_w4:
+    _CET_ENDBR
 %define hv4_line_0_0 4
 %define hv4_line_0_1 5
 %define hv4_line_0_2 6
@@ -2369,6 +2446,7 @@ cglobal put_8tap_8bpc, 1, 9, 0, dst, ds, src, ss, w, h
 %undef subpelv2
 %undef subpelv3
 .hv_w8:
+    _CET_ENDBR
 %assign stack_offset org_stack_offset
 %define hv8_line_1 0
 %define hv8_line_2 1
@@ -2869,6 +2947,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w,
 %endif
     jmp wq
 .h:
+    _CET_ENDBR
     LEA base_reg, prep%+SUFFIX
     test myd, 0xf00
     jnz .hv
@@ -2918,6 +2997,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w,
     add wq, base_reg
     jmp wq
 .h_w4:
+    _CET_ENDBR
 %if ARCH_X86_32
     and mxd, 0x7f
 %else
@@ -3038,6 +3118,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w,
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
 %if cpuflag(ssse3)
     PREP_8TAP_H 0, srcq+strideq*0
     PREP_8TAP_H 1, srcq+strideq*1
@@ -3056,15 +3137,19 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w,
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     mov r3, -16*1
     jmp .h_start
 .h_w32:
+    _CET_ENDBR
     mov r3, -16*2
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov r3, -16*4
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov r3, -16*8
 .h_start:
     sub srcq, r3
@@ -3090,6 +3175,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w,
     jg .h_loop
     RET
 .v:
+    _CET_ENDBR
     LEA base_reg, prep%+SUFFIX
 %if ARCH_X86_32
     mov mxd, myd
@@ -3149,6 +3235,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w,
     jns .v_w8
 %endif
 .v_w4:
+    _CET_ENDBR
 %if notcpuflag(ssse3)
     pxor m6, m6
 %if ARCH_X86_64
@@ -3262,6 +3349,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w,
     RET
 %if ARCH_X86_64
 .v_w8:
+    _CET_ENDBR
     lea r6d, [wq*8-64]
     mov r5, srcq
     mov r8, tmpq
@@ -3359,6 +3447,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w,
 %undef subpel2
 %undef subpel3
 .hv:
+    _CET_ENDBR
 %assign stack_offset org_stack_offset
     cmp wd, 4
     jg .hv_w8
@@ -3659,6 +3748,7 @@ cglobal prep_8tap_8bpc, 1, 9, 0, tmp, src, stride, w,
 %undef subpelv2
 %undef subpelv3
 .hv_w8:
+    _CET_ENDBR
 %assign stack_offset org_stack_offset
 %define hv8_line_1 0
 %define hv8_line_2 1
@@ -4317,6 +4407,7 @@ cglobal prep_8tap_scaled_8bpc, 0, 6, 8, 0x200, tmp, sr
     jmp wq
 %ifidn %1, put
 .w2:
+    _CET_ENDBR
 %if ARCH_X86_64
     mov myd, mym
     movzx t0d, t0b
@@ -4551,6 +4642,7 @@ INIT_XMM ssse3
 %endif
 INIT_XMM ssse3
 .w4:
+    _CET_ENDBR
 %if ARCH_X86_64
     mov myd, mym
     movzx t0d, t0b
@@ -4896,22 +4988,27 @@ INIT_XMM ssse3
     jmp .w4_loop
 INIT_XMM ssse3
 .w8:
+    _CET_ENDBR
     mov dword [rsp+0x90], 1
     movifprep tmp_stridem, 16
     jmp .w_start
 .w16:
+    _CET_ENDBR
     mov dword [rsp+0x90], 2
     movifprep tmp_stridem, 32
     jmp .w_start
 .w32:
+    _CET_ENDBR
     mov dword [rsp+0x90], 4
     movifprep tmp_stridem, 64
     jmp .w_start
 .w64:
+    _CET_ENDBR
     mov dword [rsp+0x90], 8
     movifprep tmp_stridem, 128
     jmp .w_start
 .w128:
+    _CET_ENDBR
     mov dword [rsp+0x90], 16
     movifprep tmp_stridem, 256
 .w_start:
@@ -5427,11 +5524,13 @@ INIT_XMM ssse3
     jmp .vloop
 INIT_XMM ssse3
 .dy1:
+    _CET_ENDBR
     movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
     add wq, base_reg
     jmp wq
 %ifidn %1, put
 .dy1_w2:
+    _CET_ENDBR
 %if ARCH_X86_64
     mov myd, mym
     movzx t0d, t0b
@@ -5606,6 +5705,7 @@ INIT_XMM ssse3
 %endif
 INIT_XMM ssse3
 .dy1_w4:
+    _CET_ENDBR
 %if ARCH_X86_64
     mov myd, mym
     movzx t0d, t0b
@@ -5940,22 +6040,27 @@ INIT_XMM ssse3
     jmp .dy1_w4_loop
 INIT_XMM ssse3
 .dy1_w8:
+    _CET_ENDBR
     mov dword [rsp+0x90], 1
     movifprep tmp_stridem, 16
     jmp .dy1_w_start
 .dy1_w16:
+    _CET_ENDBR
     mov dword [rsp+0x90], 2
     movifprep tmp_stridem, 32
     jmp .dy1_w_start
 .dy1_w32:
+    _CET_ENDBR
     mov dword [rsp+0x90], 4
     movifprep tmp_stridem, 64
     jmp .dy1_w_start
 .dy1_w64:
+    _CET_ENDBR
     mov dword [rsp+0x90], 8
     movifprep tmp_stridem, 128
     jmp .dy1_w_start
 .dy1_w128:
+    _CET_ENDBR
     mov dword [rsp+0x90], 16
     movifprep tmp_stridem, 256
 .dy1_w_start:
@@ -6406,11 +6511,13 @@ INIT_XMM ssse3
     jmp .dy1_vloop
 INIT_XMM ssse3
 .dy2:
+    _CET_ENDBR
     movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
     add wq, base_reg
     jmp wq
 %ifidn %1, put
 .dy2_w2:
+    _CET_ENDBR
 %if ARCH_X86_64
     mov myd, mym
     movzx t0d, t0b
@@ -6592,6 +6699,7 @@ INIT_XMM ssse3
 %endif
 INIT_XMM ssse3
 .dy2_w4:
+    _CET_ENDBR
 %if ARCH_X86_64
     mov myd, mym
     movzx t0d, t0b
@@ -6842,22 +6950,27 @@ INIT_XMM ssse3
     MC_8TAP_SCALED_RET
 INIT_XMM ssse3
 .dy2_w8:
+    _CET_ENDBR
     mov dword [rsp+0x90], 1
     movifprep tmp_stridem, 16
     jmp .dy2_w_start
 .dy2_w16:
+    _CET_ENDBR
     mov dword [rsp+0x90], 2
     movifprep tmp_stridem, 32
     jmp .dy2_w_start
 .dy2_w32:
+    _CET_ENDBR
     mov dword [rsp+0x90], 4
     movifprep tmp_stridem, 64
     jmp .dy2_w_start
 .dy2_w64:
+    _CET_ENDBR
     mov dword [rsp+0x90], 8
     movifprep tmp_stridem, 128
     jmp .dy2_w_start
 .dy2_w128:
+    _CET_ENDBR
     mov dword [rsp+0x90], 16
     movifprep tmp_stridem, 256
 .dy2_w_start:
@@ -7852,6 +7965,7 @@ ALIGN function_align
     ret
 ALIGN function_align
 .h:
+    _CET_ENDBR
 %if ARCH_X86_32
 %define m8 m3
 %define m9 m4
@@ -8022,6 +8136,7 @@ DECLARE_REG_TMP 6, 7
     %1 0
     lea dstq, [dstq+strideq*4]
 .w4: ; tile 4x
+    _CET_ENDBR
     movd [dstq], m0 ; copy dw[0]
     pshuflw m1, m0, q1032 ; swap dw[1] and dw[0]
     movd [dstq+strideq*1], m1 ; copy dw[1]
@@ -8037,6 +8152,7 @@ DECLARE_REG_TMP 6, 7
     %1 0
     lea dstq, [dstq+strideq*2]
 .w8:
+    _CET_ENDBR
     movq [dstq], m0
     movhps [dstq+strideq*1], m0
     sub hd, 2
@@ -8047,6 +8163,7 @@ DECLARE_REG_TMP 6, 7
     %1 0
     lea dstq, [dstq+strideq]
 .w16:
+    _CET_ENDBR
     mova [dstq], m0
     dec hd
     jg .w16_loop
@@ -8056,6 +8173,7 @@ DECLARE_REG_TMP 6, 7
     %1 0
     lea dstq, [dstq+strideq]
 .w32:
+    _CET_ENDBR
     mova [dstq], m0
     %1 2
     mova [dstq+16], m0
@@ -8067,6 +8185,7 @@ DECLARE_REG_TMP 6, 7
     %1 0
     add dstq, strideq
 .w64:
+    _CET_ENDBR
 %assign i 0
 %rep 4
     mova [dstq+i*16], m0
@@ -8083,6 +8203,7 @@ DECLARE_REG_TMP 6, 7
     %1 0
     add dstq, strideq
 .w128:
+    _CET_ENDBR
 %assign i 0
 %rep 8
     mova [dstq+i*16], m0
@@ -8264,6 +8385,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add maskq, 4
     lea dstq, [dstq+strideq*2]
 .w4:
+    _CET_ENDBR
     pshufd m3, m2, q2020
     pshufd m2, m2, q3131
     psubw m1, m7, m3
@@ -8287,6 +8409,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add maskq, 4
     lea dstq, [dstq+strideq*2]
 .w8:
+    _CET_ENDBR
     movhlps m3, m2
     psubw m1, m7, m2
     psubw m1, m3
@@ -8303,6 +8426,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add maskq, 8
     lea dstq, [dstq+strideq*2]
 .w16:
+    _CET_ENDBR
     mova [dstq+strideq*1], m2
     mova [dstq+strideq*0], m0
     call .main
@@ -8320,6 +8444,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add maskq, 16
     lea dstq, [dstq+strideq*2]
 .w32:
+    _CET_ENDBR
     mova [maskq], m2
     mova [dstq+strideq*0+16*0], m0
     call .main
@@ -8334,6 +8459,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add maskq, 16*2
     lea dstq, [dstq+strideq*2]
 .w64:
+    _CET_ENDBR
     mova [maskq+16*0], m2
     mova [dstq+strideq*0+16*0], m0
     call .main
@@ -8354,6 +8480,7 @@ cglobal w_mask_420_8bpc, 4, 7, 9, dst, stride, tmp1, t
     add maskq, 16*4
     lea dstq, [dstq+strideq*2]
 .w128:
+    _CET_ENDBR
     mova [maskq+16*0], m2
     mova [dstq+strideq*0+16*0], m0
     call .main
@@ -8457,6 +8584,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1,
     add maskq, 8
     lea dstq, [dstq+strideq*2]
 .w4:
+    _CET_ENDBR
     packuswb m2, m2
     psubb m1, m7, m2
 %if ARCH_X86_64
@@ -8482,6 +8610,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1,
     add maskq, 16
     lea dstq, [dstq+strideq*2]
 .w8:
+    _CET_ENDBR
     W_MASK_422_BACKUP 0
     movq [dstq+strideq*0], m0
     movhps [dstq+strideq*1], m0
@@ -8498,6 +8627,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1,
     add maskq, 16
     lea dstq, [dstq+strideq*2]
 .w16:
+    _CET_ENDBR
     W_MASK_422_BACKUP 0
     mova [dstq+strideq*0], m0
     call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
@@ -8511,6 +8641,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1,
     add maskq, 16
     add dstq, strideq
 .w32:
+    _CET_ENDBR
     W_MASK_422_BACKUP 0
     mova [dstq+16*0], m0
     call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
@@ -8524,6 +8655,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1,
     add maskq, 16*2
     add dstq, strideq
 .w64:
+    _CET_ENDBR
     W_MASK_422_BACKUP 0
     mova [dstq+16*0], m0
     call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
@@ -8543,6 +8675,7 @@ cglobal w_mask_422_8bpc, 4, 7, 11, dst, stride, tmp1,
     add maskq, 16*4
     add dstq, strideq
 .w128:
+    _CET_ENDBR
     W_MASK_422_BACKUP 0
     mova [dstq+16*0], m0
     call mangle(private_prefix %+ _w_mask_420_8bpc_ssse3).main
@@ -8593,6 +8726,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     lea dstq, [dstq+strideq*2]
 .w4:
+    _CET_ENDBR
     movd [dstq+strideq*0], m0
     pshuflw m1, m0, q1032
     movd [dstq+strideq*1], m1
@@ -8608,6 +8742,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     lea dstq, [dstq+strideq*2]
 .w8:
+    _CET_ENDBR
     movq [dstq+strideq*0], m0
     movhps [dstq+strideq*1], m0
     sub hd, 2
@@ -8617,6 +8752,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     lea dstq, [dstq+strideq*2]
 .w16:
+    _CET_ENDBR
     mova [dstq+strideq*0], m0
     call .main
     mova [dstq+strideq*1], m0
@@ -8627,6 +8763,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     add dstq, strideq
 .w32:
+    _CET_ENDBR
     mova [dstq+16*0], m0
     call .main
     mova [dstq+16*1], m0
@@ -8637,6 +8774,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     add dstq, strideq
 .w64:
+    _CET_ENDBR
     mova [dstq+16*0], m0
     call .main
     mova [dstq+16*1], m0
@@ -8651,6 +8789,7 @@ cglobal w_mask_444_8bpc, 4, 7, 9, dst, stride, tmp1, t
     call .main
     add dstq, strideq
 .w128:
+    _CET_ENDBR
     mova [dstq+16*0], m0
     call .main
     mova [dstq+16*1], m0
@@ -8729,6 +8868,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     lea r6, [dsq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     movq m0, [maskq] ; m
     movd m1, [dstq+dsq*0] ; a
     movd m6, [dstq+dsq*1]
@@ -8750,6 +8890,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     jg .w4
     RET
 .w8:
+    _CET_ENDBR
     mova m0, [maskq] ; m
     movq m1, [dstq+dsq*0] ; a
     movhps m1, [dstq+dsq*1]
@@ -8764,6 +8905,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     jg .w8
     RET
 .w16:
+    _CET_ENDBR
     mova m0, [maskq] ; m
     mova m1, [dstq] ; a
     mova m6, [tmpq] ; b
@@ -8776,6 +8918,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     jg .w16
     RET
 .w32:
+    _CET_ENDBR
 %assign i 0
 %rep 2
     mova m0, [maskq+16*i] ; m
@@ -8803,6 +8946,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     add maskq, obmc_masks-blend_v_ssse3_table
     jmp wq
 .w2:
+    _CET_ENDBR
     movd m3, [maskq+4]
     punpckldq m3, m3
     ; 2 mask blend is provided for 4 pixels / 2 lines
@@ -8824,6 +8968,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w2_loop
     RET
 .w4:
+    _CET_ENDBR
     movddup m3, [maskq+8]
     ; 4 mask blend is provided for 8 pixels / 2 lines
 .w4_loop:
@@ -8844,6 +8989,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w4_loop
     RET
 .w8:
+    _CET_ENDBR
     mova m3, [maskq+16]
     ; 8 mask blend is provided for 16 pixels
 .w8_loop:
@@ -8859,6 +9005,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w8_loop
     RET
 .w16:
+    _CET_ENDBR
     ; 16 mask blend is provided for 32 pixels
     mova m3, [maskq+32] ; obmc_masks_16[0] (64-m[0])
     mova m4, [maskq+48] ; obmc_masks_16[1] (64-m[1])
@@ -8873,6 +9020,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     jg .w16_loop
     RET
 .w32:
+    _CET_ENDBR
 %if WIN64
     mova [rsp+8], xmm6
 %endif
@@ -8922,6 +9070,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     neg hq
     jmp wq
 .w2:
+    _CET_ENDBR
     movd m0, [dstq+dsq*0]
     pinsrw m0, [dstq+dsq*1], 1
     movd m2, [maskq+hq*2]
@@ -8941,6 +9090,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     jl .w2
     RET
 .w4:
+    _CET_ENDBR
 %if ARCH_X86_32
     mova m3, [base+blend_shuf]
 %else
@@ -8966,6 +9116,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     jl .w4_loop
     RET
 .w8:
+    _CET_ENDBR
     movd m4, [maskq+hq*2]
     punpcklwd m4, m4
     pshufd m3, m4, q0000
@@ -8983,6 +9134,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
     RET
 ; w16/w32/w64/w128
 .w16:
+    _CET_ENDBR
 %if ARCH_X86_32
     mov r6d, wm
 %endif