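Harden dav1d's AVX2 motion-compensation assembly for Intel CET indirect-branch
tracking (IBT): every local label that is reached through a computed jump (the
"jmp wq" width dispatch used throughout this file, plus the "jmp r6" dispatch
in the 8-tap vertical path) now starts with the _CET_ENDBR marker, which must
expand to an endbr64 instruction in IBT-enabled builds so the CPU accepts
these labels as valid indirect-branch targets.

The _CET_ENDBR macro itself is defined outside this file; a minimal sketch of
a typical definition follows (the __CET__ guard name is an assumption for
illustration, not part of this diff). endbr64 is encoded in NOP space, so on
CPUs without CET the added instruction is harmless:

    %ifdef __CET__
        %define _CET_ENDBR endbr64      ; emit an IBT landing pad
    %else
        %define _CET_ENDBR              ; expands to nothing without CET
    %endif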
Index: src/x86/mc_avx2.asm
--- src/x86/mc_avx2.asm.orig
+++ src/x86/mc_avx2.asm
@@ -212,10 +212,12 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     test mxyd, mxyd
     jnz .v
 .put:
+    _CET_ENDBR
     movzx wd, word [r7+wq*2+table_offset(put,)]
     add wq, r7
     jmp wq
 .put_w2:
+    _CET_ENDBR
     movzx r6d, word [srcq+ssq*0]
     movzx r7d, word [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -226,6 +228,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     jg .put_w2
     RET
 .put_w4:
+    _CET_ENDBR
     mov r6d, [srcq+ssq*0]
     mov r7d, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -236,6 +239,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     jg .put_w4
     RET
 .put_w8:
+    _CET_ENDBR
     mov r6, [srcq+ssq*0]
     mov r7, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -246,6 +250,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     jg .put_w8
     RET
 .put_w16:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0]
     movu m1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -257,6 +262,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
     RET
 INIT_YMM avx2
 .put_w32:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0]
     movu m1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -267,6 +273,7 @@ INIT_YMM avx2
     jg .put_w32
     RET
 .put_w64:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0+32*0]
     movu m1, [srcq+ssq*0+32*1]
     movu m2, [srcq+ssq*1+32*0]
@@ -281,6 +288,7 @@ INIT_YMM avx2
     jg .put_w64
     RET
 .put_w128:
+    _CET_ENDBR
     movu m0, [srcq+32*0]
     movu m1, [srcq+32*1]
     movu m2, [srcq+32*2]
@@ -295,6 +303,7 @@ INIT_YMM avx2
     jg .put_w128
     RET
 .h:
+    _CET_ENDBR
     ; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
     ; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
     imul mxyd, 255
@@ -310,6 +319,7 @@ INIT_YMM avx2
     add wq, r7
     jmp wq
 .h_w2:
+    _CET_ENDBR
     movd xm0, [srcq+ssq*0]
     pinsrd xm0, [srcq+ssq*1], 1
     lea srcq, [srcq+ssq*2]
@@ -324,6 +334,7 @@ INIT_YMM avx2
     jg .h_w2
     RET
 .h_w4:
+    _CET_ENDBR
     mova xm4, [bilin_h_shuf4]
 .h_w4_loop:
     movq xm0, [srcq+ssq*0]
@@ -340,6 +351,7 @@ INIT_YMM avx2
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     movu xm0, [srcq+ssq*0]
     movu xm1, [srcq+ssq*1]
     lea srcq, [srcq+ssq*2]
@@ -357,6 +369,7 @@ INIT_YMM avx2
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu xm0, [srcq+ssq*0+8*0]
     vinserti128 m0, [srcq+ssq*1+8*0], 1
     movu xm1, [srcq+ssq*0+8*1]
@@ -376,6 +389,7 @@ INIT_YMM avx2
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     movu m0, [srcq+8*0]
     movu m1, [srcq+8*1]
     add srcq, ssq
@@ -392,6 +406,7 @@ INIT_YMM avx2
     jg .h_w32
     RET
 .h_w64:
+    _CET_ENDBR
     movu m0, [srcq+8*0]
     movu m1, [srcq+8*1]
     pshufb m0, m4
@@ -418,6 +433,7 @@ INIT_YMM avx2
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     mov r6, -32*3
 .h_w128_loop:
     movu m0, [srcq+r6+32*3+8*0]
@@ -438,6 +454,7 @@ INIT_YMM avx2
     jg .h_w128
     RET
 .v:
+    _CET_ENDBR
     movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
     imul mxyd, 255
     vpbroadcastd m5, [pw_2048]
@@ -447,6 +464,7 @@ INIT_YMM avx2
     vpbroadcastw m4, xm4
     jmp wq
 .v_w2:
+    _CET_ENDBR
     movd xm0, [srcq+ssq*0]
 .v_w2_loop:
     pinsrw xm1, xm0, [srcq+ssq*1], 1 ; 0 1
@@ -464,6 +482,7 @@ INIT_YMM avx2
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd xm0, [srcq+ssq*0]
 .v_w4_loop:
     vpbroadcastd xm2, [srcq+ssq*1]
@@ -482,6 +501,7 @@ INIT_YMM avx2
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq xm0, [srcq+ssq*0]
 .v_w8_loop:
     movq xm2, [srcq+ssq*1]
@@ -501,6 +521,7 @@ INIT_YMM avx2
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     movu xm0, [srcq+ssq*0]
 .v_w16_loop:
     vbroadcasti128 m3, [srcq+ssq*1]
@@ -522,6 +543,7 @@ INIT_YMM avx2
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
 %macro PUT_BILIN_V_W32 0
     movu m0, [srcq+ssq*0]
 %%loop:
@@ -551,6 +573,7 @@ INIT_YMM avx2
     PUT_BILIN_V_W32
     RET
 .v_w64:
+    _CET_ENDBR
     movu m0, [srcq+32*0]
     movu m1, [srcq+32*1]
 .v_w64_loop:
@@ -580,6 +603,7 @@ INIT_YMM avx2
     jg .v_w64_loop
     RET
 .v_w128:
+    _CET_ENDBR
     lea r6d, [hq+(3<<8)]
     mov r4, srcq
     mov r7, dstq
@@ -594,6 +618,7 @@ INIT_YMM avx2
     jg .v_w128_loop
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
     ; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
     movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
@@ -606,6 +631,7 @@ INIT_YMM avx2
     vpbroadcastw m6, xm6
     jmp wq
 .hv_w2:
+    _CET_ENDBR
     vpbroadcastd xm0, [srcq+ssq*0]
     pshufb xm0, xm4
     pmaddubsw xm0, xm5
@@ -630,6 +656,7 @@ INIT_YMM avx2
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova xm4, [bilin_h_shuf4]
     movddup xm0, [srcq+ssq*0]
     pshufb xm0, xm4
@@ -655,6 +682,7 @@ INIT_YMM avx2
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     vbroadcasti128 m0, [srcq+ssq*0]
     pshufb m0, m4
     pmaddubsw m0, m5
@@ -680,6 +708,7 @@ INIT_YMM avx2
     jg .hv_w8_loop
     RET
 .hv_w16:
+    _CET_ENDBR
     movu m0, [srcq+ssq*0+8*0]
     vinserti128 m0, [srcq+ssq*0+8*1], 1
     pshufb m0, m4
@@ -713,14 +742,17 @@ INIT_YMM avx2
     jg .hv_w16_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea r6d, [hq+(3<<16)]
     jmp .hv_w32_start
 .hv_w64:
+    _CET_ENDBR
     lea r6d, [hq+(1<<16)]
 .hv_w32_start:
     mov r4, srcq
     mov r7, dstq
 .hv_w32:
+    _CET_ENDBR
 %if WIN64
     movaps r4m, xmm8
 %endif
@@ -784,6 +816,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea stride3q, [strideq*3]
     jmp wq
 .prep_w4:
+    _CET_ENDBR
     movd xm0, [srcq+strideq*0]
     pinsrd xm0, [srcq+strideq*1], 1
     pinsrd xm0, [srcq+strideq*2], 2
@@ -797,6 +830,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w4
     RET
 .prep_w8:
+    _CET_ENDBR
     movq xm0, [srcq+strideq*0]
     movhps xm0, [srcq+strideq*1]
     movq xm1, [srcq+strideq*2]
@@ -813,6 +847,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w8
     RET
 .prep_w16:
+    _CET_ENDBR
     pmovzxbw m0, [srcq+strideq*0]
     pmovzxbw m1, [srcq+strideq*1]
     pmovzxbw m2, [srcq+strideq*2]
@@ -831,6 +866,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w16
     RET
 .prep_w32:
+    _CET_ENDBR
     pmovzxbw m0, [srcq+strideq*0+16*0]
     pmovzxbw m1, [srcq+strideq*0+16*1]
     pmovzxbw m2, [srcq+strideq*1+16*0]
@@ -849,6 +885,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w32
     RET
 .prep_w64:
+    _CET_ENDBR
     pmovzxbw m0, [srcq+16*0]
     pmovzxbw m1, [srcq+16*1]
     pmovzxbw m2, [srcq+16*2]
@@ -867,6 +904,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w64
     RET
 .prep_w128:
+    _CET_ENDBR
     pmovzxbw m0, [srcq+16*0]
     pmovzxbw m1, [srcq+16*1]
     pmovzxbw m2, [srcq+16*2]
@@ -897,6 +935,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .prep_w128
     RET
 .h:
+    _CET_ENDBR
     ; 16 * src[x] + (mx * (src[x + 1] - src[x]))
     ; = (16 - mx) * src[x] + mx * src[x + 1]
     imul mxyd, 255
@@ -912,6 +951,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea stride3q, [strideq*3]
     jmp wq
 .h_w4:
+    _CET_ENDBR
     vbroadcasti128 m4, [bilin_h_shuf4]
 .h_w4_loop:
     movq xm0, [srcq+strideq*0]
@@ -928,6 +968,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
 .h_w8_loop:
     movu xm0, [srcq+strideq*0]
     vinserti128 m0, [srcq+strideq*1], 1
@@ -945,6 +986,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w8_loop
     RET
 .h_w16:
+    _CET_ENDBR
 .h_w16_loop:
     movu xm0, [srcq+strideq*0+8*0]
     vinserti128 m0, [srcq+strideq*0+8*1], 1
@@ -972,6 +1014,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w16_loop
     RET
 .h_w32:
+    _CET_ENDBR
 .h_w32_loop:
     movu xm0, [srcq+strideq*0+8*0]
     vinserti128 m0, [srcq+strideq*0+8*1], 1
@@ -999,6 +1042,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w32_loop
     RET
 .h_w64:
+    _CET_ENDBR
     movu xm0, [srcq+8*0]
     vinserti128 m0, [srcq+8*1], 1
     movu xm1, [srcq+8*2]
@@ -1025,6 +1069,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w64
     RET
 .h_w128:
+    _CET_ENDBR
     movu xm0, [srcq+8*0]
     vinserti128 m0, [srcq+8*1], 1
     movu xm1, [srcq+8*2]
@@ -1071,6 +1116,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .h_w128
     RET
 .v:
+    _CET_ENDBR
     WIN64_SPILL_XMM 7
     movzx wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
     imul mxyd, 255
@@ -1081,6 +1127,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     vpbroadcastw m6, xm6
     jmp wq
 .v_w4:
+    _CET_ENDBR
     movd xm0, [srcq+strideq*0]
 .v_w4_loop:
     vpbroadcastd m1, [srcq+strideq*2]
@@ -1101,6 +1148,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq xm0, [srcq+strideq*0]
 .v_w8_loop:
     vpbroadcastq m1, [srcq+strideq*2]
@@ -1124,6 +1172,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     vbroadcasti128 m0, [srcq+strideq*0]
 .v_w16_loop:
     vbroadcasti128 m1, [srcq+strideq*1]
@@ -1151,6 +1200,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w16_loop
     RET
 .v_w32:
+    _CET_ENDBR
     vpermq m0, [srcq+strideq*0], q3120
 .v_w32_loop:
     vpermq m1, [srcq+strideq*1], q3120
@@ -1187,6 +1237,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w32_loop
     RET
 .v_w64:
+    _CET_ENDBR
     vpermq m0, [srcq+strideq*0+32*0], q3120
     vpermq m1, [srcq+strideq*0+32*1], q3120
 .v_w64_loop:
@@ -1224,6 +1275,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w64_loop
     RET
 .v_w128:
+    _CET_ENDBR
     lea r6d, [hq+(3<<8)]
     mov r3, srcq
     mov r5, tmpq
@@ -1257,6 +1309,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .v_w128_loop0
     RET
 .hv:
+    _CET_ENDBR
     ; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
     ; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
 %assign stack_offset stack_offset - stack_size_padded
@@ -1269,6 +1322,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     lea stride3q, [strideq*3]
     jmp wq
 .hv_w4:
+    _CET_ENDBR
     vbroadcasti128 m4, [bilin_h_shuf4]
     vpbroadcastq m0, [srcq+strideq*0]
     pshufb m0, m4
@@ -1294,6 +1348,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     vbroadcasti128 m0, [srcq+strideq*0]
     pshufb m0, m4
     pmaddubsw m0, m5
@@ -1322,6 +1377,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w8_loop
     RET
 .hv_w16:
+    _CET_ENDBR
     movu xm0, [srcq+strideq*0+8*0]
     vinserti128 m0, [srcq+strideq*0+8*1], 1
     pshufb m0, m4
@@ -1349,6 +1405,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w16_loop
     RET
 .hv_w32:
+    _CET_ENDBR
     movu xm0, [srcq+8*0]
     vinserti128 m0, [srcq+8*1], 1
     movu xm1, [srcq+8*2]
@@ -1382,10 +1439,12 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
     jg .hv_w32_loop
     RET
 .hv_w128:
+    _CET_ENDBR
     lea r3d, [hq+(7<<8)]
     mov r6d, 256
     jmp .hv_w64_start
 .hv_w64:
+    _CET_ENDBR
     lea r3d, [hq+(3<<8)]
     mov r6d, 128
 .hv_w64_start:
@@ -1489,6 +1548,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
 %endif
     jmp wq
 .h:
+    _CET_ENDBR
     test myd, 0xf00
     jnz .hv
     vpbroadcastd m5, [pw_34] ; 2 + (8 << 2)
@@ -1508,6 +1568,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     add wq, r8
     jmp wq
 .h_w2:
+    _CET_ENDBR
     movzx mxd, mxb
     dec srcq
     mova xm4, [subpel_h_shuf4]
@@ -1529,6 +1590,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w2_loop
     RET
 .h_w4:
+    _CET_ENDBR
     movzx mxd, mxb
     dec srcq
     vpbroadcastd xm3, [r8+mxq*8+subpel_filters-put_avx2+2]
@@ -1551,6 +1613,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
 %macro PUT_8TAP_H 4 ; dst/src, tmp[1-3]
     pshufb m%2, m%1, m7
     pshufb m%3, m%1, m8
@@ -1578,6 +1641,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu xm0, [srcq+ssq*0+8*0]
     vinserti128 m0, [srcq+ssq*1+8*0], 1
     movu xm1, [srcq+ssq*0+8*1]
@@ -1593,12 +1657,15 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     xor r6d, r6d
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov r6, -32*1
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov r6, -32*3
 .h_start:
     sub srcq, r6
@@ -1620,6 +1687,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .h_loop
     RET
 .v:
+    _CET_ENDBR
 %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM 16
     movzx mxd, myb
@@ -1639,6 +1707,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     sub srcq, ss3q
     jmp r6
 .v_w2:
+    _CET_ENDBR
     movd xm2, [srcq+ssq*0]
     pinsrw xm2, [srcq+ssq*1], 2
     pinsrw xm2, [srcq+ssq*2], 4
@@ -1679,6 +1748,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w2_loop
     RET
 .v_w4:
+    _CET_ENDBR
     movd xm2, [srcq+ssq*0]
     pinsrd xm2, [srcq+ssq*1], 1
     pinsrd xm2, [srcq+ssq*2], 2
@@ -1719,6 +1789,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq xm1, [srcq+ssq*0]
     vpbroadcastq m4, [srcq+ssq*1]
     vpbroadcastq m2, [srcq+ssq*2]
@@ -1766,6 +1837,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
 .v_w32:
 .v_w64:
 .v_w128:
+    _CET_ENDBR
     lea r6d, [wq*8-128]
     mov r4, srcq
     mov r7, dstq
@@ -1834,6 +1906,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .v_w16_loop0
     RET
 .hv:
+    _CET_ENDBR
 %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM 16
     cmp wd, 4
@@ -1914,6 +1987,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w2_loop
     RET
 .hv_w4:
+    _CET_ENDBR
     mova m6, [subpel_h_shuf4]
     vpbroadcastq m2, [srcq+ssq*0]
     vpbroadcastq m4, [srcq+ssq*1]
@@ -1978,6 +2052,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     shr mxd, 16
     sub srcq, 3
     vpbroadcastd m10, [r8+mxq*8+subpel_filters-put_avx2+0]
@@ -2153,6 +2228,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
 %endif
     jmp wq
 .h:
+    _CET_ENDBR
     test myd, 0xf00
     jnz .hv
     vpbroadcastd m4, [pw_8192]
@@ -2171,6 +2247,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     add wq, r7
     jmp wq
 .h_w4:
+    _CET_ENDBR
     movzx mxd, mxb
     dec srcq
     vpbroadcastd m6, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
@@ -2195,6 +2272,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .h_w4_loop
     RET
 .h_w8:
+    _CET_ENDBR
     movu xm0, [srcq+strideq*0]
     vinserti128 m0, [srcq+strideq*1], 1
     lea srcq, [srcq+strideq*2]
@@ -2205,6 +2283,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .h_w8
     RET
 .h_w16:
+    _CET_ENDBR
     movu xm0, [srcq+strideq*0+8*0]
     vinserti128 m0, [srcq+strideq*0+8*1], 1
     PREP_8TAP_H
@@ -2219,12 +2298,15 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .h_w16
     RET
 .h_w32:
+    _CET_ENDBR
     xor r6d, r6d
     jmp .h_start
 .h_w64:
+    _CET_ENDBR
     mov r6, -32*1
     jmp .h_start
 .h_w128:
+    _CET_ENDBR
     mov r6, -32*3
 .h_start:
     sub srcq, r6
@@ -2247,6 +2329,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .h_loop
     RET
 .v:
+    _CET_ENDBR
 %assign stack_offset stack_offset - stack_size_padded
     WIN64_SPILL_XMM 16
     movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
@@ -2266,6 +2349,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .v_w16
     je .v_w8
 .v_w4:
+    _CET_ENDBR
     movd xm0, [srcq+strideq*0]
     vpbroadcastd m1, [srcq+strideq*2]
     vpbroadcastd xm2, [srcq+strideq*1]
@@ -2310,6 +2394,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .v_w4_loop
     RET
 .v_w8:
+    _CET_ENDBR
     movq xm1, [srcq+strideq*0]
     vpbroadcastq m4, [srcq+strideq*1]
     vpbroadcastq m2, [srcq+strideq*2]
@@ -2363,6 +2448,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .v_w8_loop
     RET
 .v_w16:
+    _CET_ENDBR
     add wd, wd
     mov r5, srcq
     mov r7, tmpq
@@ -2430,6 +2516,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .v_w16_loop0
     RET
 .hv:
+    _CET_ENDBR
 %assign stack_offset stack_offset - stack_size_padded
 %assign stack_size_padded 0
     WIN64_SPILL_XMM 16
@@ -2454,6 +2541,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     pshufd m15, m0, q3333
     jmp .hv_w8
 .hv_w4:
+    _CET_ENDBR
     movzx mxd, mxb
     dec srcq
     vpbroadcastd m8, [r7+mxq*8+subpel_filters-prep%+SUFFIX+2]
@@ -2547,6 +2635,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
     jg .hv_w4_loop
     RET
 .hv_w8:
+    _CET_ENDBR
     lea r6d, [wq*8-64]
     mov r5, srcq
     mov r7, tmpq
@@ -2794,6 +2883,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jmp wq
 %ifidn %1, put
 .w2:
+    _CET_ENDBR
     mov myd, mym
     movzx t0d, t0b
     dec srcq
@@ -2904,6 +2994,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jmp .w2_loop
 %endif
 .w4:
+    _CET_ENDBR
     mov myd, mym
     vbroadcasti128 m7, [base+rescale_mul]
     movzx t0d, t0b
@@ -3043,22 +3134,27 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     lea srcq, [srcq+ssq*2]
     jmp .w4_loop
 .w8:
+    _CET_ENDBR
     mov dword [rsp+48], 1
     movifprep tmp_stridem, 16
     jmp .w_start
 .w16:
+    _CET_ENDBR
     mov dword [rsp+48], 2
     movifprep tmp_stridem, 32
     jmp .w_start
 .w32:
+    _CET_ENDBR
     mov dword [rsp+48], 4
     movifprep tmp_stridem, 64
     jmp .w_start
 .w64:
+    _CET_ENDBR
     mov dword [rsp+48], 8
     movifprep tmp_stridem, 128
     jmp .w_start
 .w128:
+    _CET_ENDBR
     mov dword [rsp+48], 16
     movifprep tmp_stridem, 256
 .w_start:
@@ -3264,6 +3360,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jmp wq
 %ifidn %1, put
 .dy1_w2:
+    _CET_ENDBR
     mov myd, mym
     movzx t0d, t0b
     dec srcq
@@ -3351,6 +3448,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     RET
 %endif
 .dy1_w4:
+    _CET_ENDBR
     mov myd, mym
     vbroadcasti128 m7, [base+rescale_mul]
     movzx t0d, t0b
@@ -3462,22 +3560,27 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jg .dy1_w4_loop
     MC_8TAP_SCALED_RET
 .dy1_w8:
+    _CET_ENDBR
     mov dword [rsp+72], 1
     movifprep tmp_stridem, 16
     jmp .dy1_w_start
 .dy1_w16:
+    _CET_ENDBR
     mov dword [rsp+72], 2
     movifprep tmp_stridem, 32
     jmp .dy1_w_start
 .dy1_w32:
+    _CET_ENDBR
     mov dword [rsp+72], 4
     movifprep tmp_stridem, 64
     jmp .dy1_w_start
 .dy1_w64:
+    _CET_ENDBR
     mov dword [rsp+72], 8
     movifprep tmp_stridem, 128
     jmp .dy1_w_start
 .dy1_w128:
+    _CET_ENDBR
     mov dword [rsp+72], 16
     movifprep tmp_stridem, 256
 .dy1_w_start:
@@ -3623,11 +3726,13 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     pblendw m3, m4, 0xaa
     jmp .dy1_vloop
 .dy2:
+    _CET_ENDBR
     movzx wd, word [base+%1_8tap_scaled_avx2_dy2_table+wq*2]
     add wq, base_reg
     jmp wq
 %ifidn %1, put
 .dy2_w2:
+    _CET_ENDBR
     mov myd, mym
     movzx t0d, t0b
     dec srcq
@@ -3718,6 +3823,7 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     RET
 %endif
 .dy2_w4:
+    _CET_ENDBR
     mov myd, mym
     vbroadcasti128 m7, [base+rescale_mul]
     movzx t0d, t0b
@@ -3822,22 +3928,27 @@ cglobal prep_8tap_scaled_8bpc, 4, 14, 16, 128, tmp, sr
     jg .dy2_w4_loop
     MC_8TAP_SCALED_RET
 .dy2_w8:
+    _CET_ENDBR
     mov dword [rsp+40], 1
     movifprep tmp_stridem, 16
     jmp .dy2_w_start
 .dy2_w16:
+    _CET_ENDBR
     mov dword [rsp+40], 2
     movifprep tmp_stridem, 32
     jmp .dy2_w_start
 .dy2_w32:
+    _CET_ENDBR
     mov dword [rsp+40], 4
     movifprep tmp_stridem, 64
     jmp .dy2_w_start
 .dy2_w64:
+    _CET_ENDBR
     mov dword [rsp+40], 8
     movifprep tmp_stridem, 128
     jmp .dy2_w_start
 .dy2_w128:
+    _CET_ENDBR
     mov dword [rsp+40], 16
     movifprep tmp_stridem, 256
 .dy2_w_start:
@@ -4187,6 +4298,7 @@ ALIGN function_align
     lea tmp1d, [deltaq*3]
     sub gammad, tmp1d ; gamma -= delta*3
 .main2:
+    _CET_ENDBR
     call .h
     psrld m6, m5, 16
     pblendw m6, m0, 0xaa ; 57
@@ -4202,6 +4314,7 @@ ALIGN function_align
     ret
 ALIGN function_align
 .h:
+    _CET_ENDBR
     lea tmp1d, [mxq+alphaq*4]
     lea tmp2d, [mxq+alphaq*1]
     vbroadcasti128 m10, [srcq]
@@ -4244,6 +4357,7 @@ ALIGN function_align
     lea stride3q, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movd [dstq], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -4277,6 +4391,7 @@ ALIGN function_align
     %1 0
     lea dstq, [dstq+strideq*4]
 .w8:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movq [dstq], xm0
     movq [dstq+strideq*1], xm1
@@ -4290,6 +4405,7 @@ ALIGN function_align
     %1 0
     lea dstq, [dstq+strideq*4]
 .w16:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -4305,6 +4421,7 @@ ALIGN function_align
     %1 0
     lea dstq, [dstq+strideq*2]
 .w32:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], m0
     %1 2
@@ -4318,6 +4435,7 @@ ALIGN function_align
     %1 0
     add dstq, strideq
 .w64:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq], m0
     %1 2
@@ -4330,6 +4448,7 @@ ALIGN function_align
     %1 0
     add dstq, strideq
 .w128:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+0*32], m0
     %1 2
@@ -4499,6 +4618,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     lea r6, [dsq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     movd xm0, [dstq+dsq*0]
     pinsrd xm0, [dstq+dsq*1], 1
     vpbroadcastd xm1, [dstq+dsq*2]
@@ -4526,6 +4646,7 @@ cglobal blend_8bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     movq xm1, [dstq+dsq*0]
     movhps xm1, [dstq+dsq*1]
     vpbroadcastq m2, [dstq+dsq*2]
@@ -4556,6 +4677,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     mova m0, [maskq]
     mova xm1, [dstq+dsq*0]
     vinserti128 m1, [dstq+dsq*1], 1
@@ -4579,6 +4701,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     mova m0, [maskq]
     mova m1, [dstq]
     mova m6, [maskq+tmpq]
@@ -4610,6 +4733,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     add maskq, obmc_masks-blend_v_avx2_table
     jmp wq
 .w2:
+    _CET_ENDBR
     vpbroadcastd xm2, [maskq+2*2]
 .w2_s0_loop:
     movd xm0, [dstq+dsq*0]
@@ -4628,6 +4752,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
     RET
 ALIGN function_align
 .w4:
+    _CET_ENDBR
     vpbroadcastq xm2, [maskq+4*2]
 .w4_loop:
     movd xm0, [dstq+dsq*0]
@@ -4646,6 +4771,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     mova xm3, [maskq+8*2]
 .w8_loop:
     movq xm0, [dstq+dsq*0]
@@ -4667,6 +4793,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     vbroadcasti128 m3, [maskq+16*2]
     vbroadcasti128 m4, [maskq+16*3]
 .w16_loop:
@@ -4689,6 +4816,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     mova xm3, [maskq+16*4]
     vinserti128 m3, [maskq+16*6], 1
     mova xm4, [maskq+16*5]
@@ -4726,6 +4854,7 @@ cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mas
     neg hq
     jmp wq
 .w2:
+    _CET_ENDBR
     movd xm0, [dstq+dsq*0]
     pinsrw xm0, [dstq+dsq*1], 1
     movd xm2, [maskq+hq*2]
@@ -4744,6 +4873,7 @@ cglobal blend_h_8bpc, 4, 7, 6, dst, ds, tmp, w, h, mas
     RET
 ALIGN function_align
 .w4:
+    _CET_ENDBR
     mova xm3, [blend_shuf]
 .w4_loop:
     movd xm0, [dstq+dsq*0]
@@ -4764,6 +4894,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     vbroadcasti128 m4, [blend_shuf]
     shufpd m4, m4, 0x03
 .w8_loop:
@@ -4788,6 +4919,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     vbroadcasti128 m4, [blend_shuf]
     shufpd m4, m4, 0x0c
 .w16_loop:
@@ -4812,6 +4944,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32: ; w32/w64/w128
+    _CET_ENDBR
     sub dsq, r6
 .w32_loop0:
     vpbroadcastw m3, [maskq+hq*2]
@@ -4913,6 +5046,7 @@ cglobal emu_edge_8bpc, 10, 13, 1, bw, bh, iw, ih, x, y
 
 %macro v_loop 3 ; need_left_ext, need_right_ext, suffix
 .v_loop_%3:
+    _CET_ENDBR
 %if %1
     ; left extension
     xor r3, r3
@@ -5174,6 +5308,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     lea stride3q, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movd [dstq+strideq*0], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -5199,6 +5334,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     movq [maskq], xm4
     RET
 .w4_h16:
+    _CET_ENDBR
     W_MASK 0, 5, 2, 3
     lea dstq, [dstq+strideq*4]
     phaddd m4, m5
@@ -5226,6 +5362,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     lea dstq, [dstq+strideq*4]
     add maskq, 8
 .w8:
+    _CET_ENDBR
     vextracti128 xm2, m4, 1
     vextracti128 xm1, m0, 1
     psubw xm4, xm8, xm4
@@ -5247,6 +5384,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     lea dstq, [dstq+strideq*4]
     add maskq, 16
 .w16:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -5272,6 +5410,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     lea dstq, [dstq+strideq*2]
     add maskq, 16
 .w32:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], m0
     W_MASK 0, 5, 2, 3
@@ -5296,6 +5435,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     W_MASK 0, 4, 0, 1
     add dstq, strideq
 .w64:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     W_MASK 0, 5, 2, 3
@@ -5322,6 +5462,7 @@ cglobal w_mask_420_8bpc, 4, 8, 14, dst, stride, tmp1,
     W_MASK 0, 4, 0, 1
     add dstq, strideq
 .w128:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     W_MASK 0, 5, 2, 3
@@ -5381,6 +5522,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     lea stride3q, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movd [dstq+strideq*0], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -5403,6 +5545,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     mova [maskq], xm5
     RET
 .w4_h16:
+    _CET_ENDBR
     W_MASK 0, 5, 2, 3
     lea dstq, [dstq+strideq*4]
     packuswb m4, m5
@@ -5428,6 +5571,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     lea dstq, [dstq+strideq*4]
     add maskq, 16
 .w8:
+    _CET_ENDBR
     vextracti128 xm5, m4, 1
     vextracti128 xm1, m0, 1
     packuswb xm4, xm5
@@ -5449,6 +5593,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     lea dstq, [dstq+strideq*4]
     add maskq, 32
 .w16:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -5471,6 +5616,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     lea dstq, [dstq+strideq*2]
     add maskq, 32
 .w32:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], m0
     W_MASK 0, 5, 2, 3
@@ -5491,6 +5637,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     add dstq, strideq
     add maskq, 32
 .w64:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     W_MASK 0, 5, 2, 3
@@ -5511,6 +5658,7 @@ cglobal w_mask_422_8bpc, 4, 8, 11, dst, stride, tmp1,
     add dstq, strideq
     add maskq, 32*2
 .w128:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     W_MASK 0, 5, 2, 3
@@ -5551,6 +5699,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     lea stride3q, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movd [dstq+strideq*0], xm0
     pextrd [dstq+strideq*1], xm0, 1
@@ -5587,6 +5736,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     lea dstq, [dstq+strideq*4]
     add maskq, 32
 .w8:
+    _CET_ENDBR
     vextracti128 xm1, m0, 1
     movq [dstq+strideq*0], xm0
     movq [dstq+strideq*1], xm1
@@ -5603,6 +5753,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     lea dstq, [dstq+strideq*2]
     add maskq, 32
 .w16:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+strideq*0], xm0
     vextracti128 [dstq+strideq*1], m0, 1
@@ -5617,6 +5768,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     add dstq, strideq
     add maskq, 32
 .w32:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq], m0
     mova [maskq], m4
@@ -5630,6 +5782,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     add dstq, strideq
     add maskq, 32*2
 .w64:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     mova [maskq+32*0], m4
@@ -5647,6 +5800,7 @@ cglobal w_mask_444_8bpc, 4, 8, 8, dst, stride, tmp1, t
     add dstq, strideq
     add maskq, 32*4
 .w128:
+    _CET_ENDBR
     vpermq m0, m0, q3120
     mova [dstq+32*0], m0
     mova [maskq+32*0], m4
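
For reference, the dispatch pattern that makes these markers necessary is
visible in the context lines above (restated here with comments; this is the
existing code, not an addition by this patch):

    movzx wd, word [r7+wq*2+table_offset(put,)] ; load 16-bit offset from table
    add   wq, r7                                ; rebase to an absolute address
    jmp   wq                                    ; indirect jump to .put_w2/.put_w4/...

With IBT enforced, an indirect jmp or call whose destination does not begin
with endbr64 raises a control-protection fault, hence one _CET_ENDBR per
reachable label rather than a single marker per function.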