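This patch prepares the hand-written AVX-512 code in dav1d's src/x86/mc_avx512.asm for Intel CET indirect branch tracking (IBT): every local label reached through one of the computed jumps (jmp wq, jmp r6) gains a _CET_ENDBR marker. _CET_ENDBR is assumed to be a build-conditional macro that expands to the endbr64 instruction when CET is enabled and to nothing otherwise; under IBT, an indirect jump whose target does not begin with endbr64 raises a control-protection fault. Compilers emit endbr64 automatically for C code under -fcf-protection=branch; hand-written jump-table targets have to do it by hand, which is all this patch does. The C sketch below shows the same constraint one level up; the function and table names are hypothetical stand-ins for the width-specialized dispatch targets in this file.

/* Minimal sketch (assumes GCC/Clang on x86-64 with -fcf-protection=branch).
 * The compiler prefixes indirectly reachable C functions with endbr64;
 * assembly reached through a jump table must insert the instruction itself,
 * which is what _CET_ENDBR is assumed to expand to. */
#include <stdio.h>

typedef void (*put_fn)(void);       /* stand-in for a width-specialized kernel */

static void put_w16(void) { puts("w16 path"); } /* endbr64 emitted here under IBT */
static void put_w32(void) { puts("w32 path"); }

int main(void) {
    /* analogous to the asm jump tables indexed by block width */
    static const put_fn table[] = { put_w16, put_w32 };
    table[1]();  /* indirect branch: faults under IBT if the target lacks endbr64 */
    return 0;
}
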
Index: src/x86/mc_avx512.asm
--- src/x86/mc_avx512.asm.orig
+++ src/x86/mc_avx512.asm
@@ -321,10 +321,12 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
test mxyd, mxyd
jnz .v
.put:
+ _CET_ENDBR
movzx wd, word [r7+wq*2+table_offset(put,)]
add wq, r7
jmp wq
.put_w2:
+ _CET_ENDBR
movzx r6d, word [srcq+ssq*0]
movzx r7d, word [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -335,6 +337,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w2
RET
.put_w4:
+ _CET_ENDBR
mov r6d, [srcq+ssq*0]
mov r7d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -345,6 +348,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w4
RET
.put_w8:
+ _CET_ENDBR
mov r6, [srcq+ssq*0]
mov r7, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -355,6 +359,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w8
RET
.put_w16:
+ _CET_ENDBR
movu xmm0, [srcq+ssq*0]
movu xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -365,6 +370,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w16
RET
.put_w32:
+ _CET_ENDBR
movu ym0, [srcq+ssq*0]
movu ym1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -375,6 +381,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w32
RET
.put_w64:
+ _CET_ENDBR
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -385,6 +392,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w64
RET
.put_w128:
+ _CET_ENDBR
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
movu m2, [srcq+ssq*1+64*0]
@@ -399,6 +407,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w128
RET
.h:
+ _CET_ENDBR
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
imul mxyd, 0xff01
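
The two comment lines above record an exact identity: 16*s0 + mx*(s1 - s0) = (16 - mx)*s0 + mx*s1, which turns the filter into a two-tap weighted sum that a single pmaddubsw can evaluate against packed byte weights. The imul mxyd, 0xff01 computes -255*mx modulo 2^16, so that adding 16<<8 (presumably done a few lines below, outside this hunk) leaves ((16 - mx) << 8) | mx in the low word, i.e. both weights packed into one broadcastable word. A scalar C model of the identity, as a sketch with a hypothetical helper name:

/* Scalar model of the comment above (sketch, 8bpc, 0 <= mx < 16):
 * both forms round identically because the rewrite is pure algebra. */
#include <assert.h>
#include <stdint.h>

static uint8_t put_bilin_h_px(const uint8_t *src, int x, int mx) {
    int a = (16 * src[x] + mx * (src[x + 1] - src[x]) + 8) >> 4;
    int b = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4;
    assert(a == b);    /* 16*s0 + mx*(s1 - s0) == (16 - mx)*s0 + mx*s1 */
    return (uint8_t)a; /* the second form maps onto one pmaddubsw */
}
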
@@ -413,6 +422,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
add wq, r7
jmp wq
.h_w2:
+ _CET_ENDBR
movd xmm0, [srcq+ssq*0]
pinsrd xmm0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
@@ -427,6 +437,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w2
RET
.h_w4:
+ _CET_ENDBR
mova xmm4, [bilin_h_shuf4]
.h_w4_loop:
movq xmm0, [srcq+ssq*0]
@@ -443,6 +454,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
movu xm0, [srcq+ssq*0]
vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
@@ -457,6 +469,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w8
RET
.h_w16:
+ _CET_ENDBR
mova m4, [bilin_h_perm16]
.h_w16_loop:
movu ym0, [srcq+ssq*0]
@@ -473,6 +486,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w16_loop
RET
.h_w32:
+ _CET_ENDBR
movu ym0, [srcq+ssq*0+8*0]
vinserti32x8 m0, [srcq+ssq*1+8*0], 1
movu ym1, [srcq+ssq*0+8*1]
@@ -492,6 +506,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w32
RET
.h_w64:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
@@ -508,6 +523,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w64
RET
.h_w128:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m2, [srcq+8*1]
movu m1, [srcq+8*8]
@@ -525,6 +541,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w128
RET
.v:
+ _CET_ENDBR
movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
imul mxyd, 0xff01
vpbroadcastd m5, [pw_2048]
@@ -533,6 +550,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
vpbroadcastw m4, mxyd
jmp wq
.v_w2:
+ _CET_ENDBR
movd xmm0, [srcq+ssq*0]
.v_w2_loop:
pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
@@ -550,6 +568,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w2_loop
RET
.v_w4:
+ _CET_ENDBR
movd xmm0, [srcq+ssq*0]
.v_w4_loop:
vpbroadcastd xmm1, [srcq+ssq*1]
@@ -568,6 +587,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
movq xmm0, [srcq+ssq*0]
.v_w8_loop:
movq xmm3, [srcq+ssq*1]
@@ -587,6 +607,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w8_loop
RET
.v_w16:
+ _CET_ENDBR
movu xmm0, [srcq+ssq*0]
.v_w16_loop:
vbroadcasti128 ymm2, [srcq+ssq*1]
@@ -609,6 +630,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
vzeroupper
RET
.v_w32:
+ _CET_ENDBR
movu ym0, [srcq+ssq*0]
kxnorb k1, k1, k1
.v_w32_loop:
@@ -631,6 +653,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w32_loop
RET
.v_w64:
+ _CET_ENDBR
movu m0, [srcq+ssq*0]
.v_w64_loop:
movu m3, [srcq+ssq*1]
@@ -654,6 +677,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w64_loop
RET
.v_w128:
+ _CET_ENDBR
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
.v_w128_loop:
@@ -680,6 +704,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w128_loop
RET
.hv:
+ _CET_ENDBR
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
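
The comment describes the combined path as a vertical blend of two horizontally filtered rows: the H pass keeps 4 fractional bits, and the V pass folds in my and rounds everything back to 8 bits in one step (the + 128 and >> 8 absorb both passes' scaling). A scalar model of that arithmetic, as a sketch of the surrounding, mostly elided code; the helper name is hypothetical:

/* Scalar model of the put bilin HV path (sketch, 8bpc, 0 <= mx,my < 16).
 * h0/h1 are the 4-fractional-bit horizontal intermediates of two rows;
 * the vertical blend then rounds back to 8 bits, matching the comment. */
#include <stddef.h>
#include <stdint.h>

static uint8_t put_bilin_hv_px(const uint8_t *src, ptrdiff_t ss, int mx, int my) {
    int h0 = (16 - mx) * src[0]  + mx * src[1];      /* current row */
    int h1 = (16 - mx) * src[ss] + mx * src[ss + 1]; /* row below   */
    return (uint8_t)((16 * h0 + my * (h1 - h0) + 128) >> 8);
}
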
@@ -690,6 +715,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
vpbroadcastw m6, mxyd
jmp wq
.hv_w2:
+ _CET_ENDBR
vpbroadcastd xmm0, [srcq+ssq*0]
pshufb xmm0, xm4
pmaddubsw xmm0, xm5
@@ -714,6 +740,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w2_loop
RET
.hv_w4:
+ _CET_ENDBR
mova xmm4, [bilin_h_shuf4]
movddup xmm0, [srcq+ssq*0]
pshufb xmm0, xmm4
@@ -739,6 +766,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
vbroadcasti128 ym0, [srcq+ssq*0]
pshufb ym0, ym4
pmaddubsw ym0, ym5
@@ -763,6 +791,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w8_loop
RET
.hv_w16:
+ _CET_ENDBR
vbroadcasti32x8 m0, [srcq+ssq*0]
mova m4, [bilin_h_perm16]
vpermb m0, m4, m0
@@ -788,6 +817,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w16_loop
RET
.hv_w32:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+ssq*0]
pmovzxbq m8, [pb_02461357]
@@ -817,6 +847,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w32_loop
RET
.hv_w64:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
@@ -850,6 +881,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w64_loop
RET
.hv_w128:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
movu m2, [srcq+8*8]
@@ -910,11 +942,13 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
test mxyd, mxyd
jnz .v
.prep:
+ _CET_ENDBR
movzx wd, word [t2+wq*2+table_offset(prep,)]
add wq, t2
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
+ _CET_ENDBR
movd xmm0, [srcq+strideq*0]
pinsrd xmm0, [srcq+strideq*1], 1
pinsrd xmm0, [srcq+strideq*2], 2
@@ -928,6 +962,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w4
RET
.prep_w8:
+ _CET_ENDBR
movq xmm0, [srcq+strideq*0]
movq xmm1, [srcq+strideq*1]
vinserti128 ym0, ymm0, [srcq+strideq*2], 1
@@ -942,6 +977,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w8
RET
.prep_w16:
+ _CET_ENDBR
movu xmm0, [srcq+strideq*0]
vinserti128 ym0, ymm0, [srcq+strideq*1], 1
movu xmm1, [srcq+strideq*2]
@@ -958,6 +994,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w16
RET
.prep_w32:
+ _CET_ENDBR
pmovzxbw m0, [srcq+strideq*0]
pmovzxbw m1, [srcq+strideq*1]
pmovzxbw m2, [srcq+strideq*2]
@@ -973,6 +1010,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w32
RET
.prep_w64:
+ _CET_ENDBR
pmovzxbw m0, [srcq+strideq*0+32*0]
pmovzxbw m1, [srcq+strideq*0+32*1]
pmovzxbw m2, [srcq+strideq*1+32*0]
@@ -988,6 +1026,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w64
RET
.prep_w128:
+ _CET_ENDBR
pmovzxbw m0, [srcq+32*0]
pmovzxbw m1, [srcq+32*1]
pmovzxbw m2, [srcq+32*2]
@@ -1003,6 +1042,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w128
RET
.h:
+ _CET_ENDBR
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
imul mxyd, 0xff01
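
The prep variant of the H filter, per the comment above, drops the rounding term: the two-pass pipeline keeps the unrounded 4-fractional-bit intermediate so that rounding happens once, later, at full precision. A scalar sketch with a hypothetical helper name:

/* Scalar model of the prep bilin H pass (sketch, 8bpc): the intermediate
 * keeps 4 fractional bits unrounded and fits in int16_t (max 16*255 = 4080). */
#include <stdint.h>

static int16_t prep_bilin_h_px(const uint8_t *src, int x, int mx) {
    return (int16_t)((16 - mx) * src[x] + mx * src[x + 1]);
}
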
@@ -1016,6 +1056,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
lea stride3q, [strideq*3]
jmp wq
.h_w4:
+ _CET_ENDBR
vbroadcasti32x4 ym4, [bilin_h_shuf4]
.h_w4_loop:
movq xmm0, [srcq+strideq*0]
@@ -1032,6 +1073,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
vbroadcasti32x4 m4, [bilin_h_shuf8]
.h_w8_loop:
movu xmm0, [srcq+strideq*0]
@@ -1047,6 +1089,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w8_loop
RET
.h_w16:
+ _CET_ENDBR
mova m4, [bilin_h_perm16]
.h_w16_loop:
movu ym0, [srcq+strideq*0]
@@ -1065,6 +1108,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w16_loop
RET
.h_w32:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
.h_w32_loop:
vpermb m0, m4, [srcq+strideq*0]
@@ -1085,6 +1129,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w32_loop
RET
.h_w64:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
.h_w64_loop:
vpermb m0, m4, [srcq+strideq*0+32*0]
@@ -1105,6 +1150,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w64_loop
RET
.h_w128:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
.h_w128_loop:
vpermb m0, m4, [srcq+32*0]
@@ -1125,6 +1171,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w128_loop
RET
.v:
+ _CET_ENDBR
WIN64_SPILL_XMM 7
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
imul mxyd, 0xff01
@@ -1134,6 +1181,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
vpbroadcastw m6, mxyd
jmp wq
.v_w4:
+ _CET_ENDBR
vpbroadcastd xm0, [srcq+strideq*0]
mov r3d, 0x29
vbroadcasti32x4 ym3, [bilin_v_shuf4]
@@ -1153,6 +1201,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
mova m5, [bilin_v_perm8]
vbroadcasti32x4 ym0, [srcq+strideq*0]
.v_w8_loop:
@@ -1169,6 +1218,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w8_loop
RET
.v_w16:
+ _CET_ENDBR
mova m5, [bilin_v_perm16]
movu xm0, [srcq+strideq*0]
.v_w16_loop:
@@ -1188,6 +1238,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w16_loop
RET
.v_w32:
+ _CET_ENDBR
mova m5, [bilin_v_perm32]
movu ym0, [srcq+strideq*0]
.v_w32_loop:
@@ -1213,6 +1264,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w32_loop
RET
.v_w64:
+ _CET_ENDBR
mova m5, [bilin_v_perm64]
vpermq m0, m5, [srcq+strideq*0]
.v_w64_loop:
@@ -1236,6 +1288,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w64_loop
RET
.v_w128:
+ _CET_ENDBR
mova m5, [bilin_v_perm64]
vpermq m0, m5, [srcq+strideq*0+ 0]
vpermq m1, m5, [srcq+strideq*0+64]
@@ -1274,6 +1327,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w128_loop
RET
.hv:
+ _CET_ENDBR
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
%assign stack_offset stack_offset - stack_size_padded
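
This identity is exact: 16*src[x] is a multiple of 16, so it passes through the >> 4 unchanged, letting the vertical pass add the source term back in after a narrower multiply. A scalar check, as a sketch (assumes arithmetic right shift on signed values; the helper name is hypothetical):

/* The rewrite in the comment above is exact because 16*cur shifts out
 * cleanly (sketch; cur/below are signed H-pass intermediates). */
#include <assert.h>

static int prep_bilin_v_px(int cur, int below, int my) {
    int a = (16 * cur + my * (below - cur) + 8) >> 4;
    int b = cur + ((my * (below - cur) + 8) >> 4);
    assert(a == b); /* holds whenever >> is an arithmetic shift */
    return a;
}
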
@@ -1285,6 +1339,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
+ _CET_ENDBR
vbroadcasti32x4 ym4, [bilin_h_shuf4]
vpbroadcastq ym0, [srcq+strideq*0]
pshufb ym0, ym4
@@ -1309,6 +1364,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
vbroadcasti32x4 m4, [bilin_h_shuf8]
vbroadcasti32x4 m0, [srcq+strideq*0]
pshufb m0, m4
@@ -1332,6 +1388,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w8_loop
RET
.hv_w16:
+ _CET_ENDBR
mova m4, [bilin_h_perm16]
vbroadcasti32x8 m0, [srcq+strideq*0]
vpermb m0, m4, m0
@@ -1361,6 +1418,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w16_loop
RET
.hv_w32:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+strideq*0]
pmaddubsw m0, m5
@@ -1383,6 +1441,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w32_loop
RET
.hv_w64:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+32*0]
vpermb m1, m4, [srcq+32*1]
@@ -1409,6 +1468,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w64_loop
RET
.hv_w128:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+32*0]
vpermb m1, m4, [srcq+32*1]
@@ -1525,6 +1585,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
%endif
jmp wq
.h:
+ _CET_ENDBR
test myd, 0xf00
jnz .hv
vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
@@ -1544,6 +1605,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
add wq, r8
jmp wq
.h_w2:
+ _CET_ENDBR
movzx mxd, mxb
dec srcq
mova xmm4, [subpel_h_shuf4]
@@ -1565,6 +1627,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w2_loop
RET
.h_w4:
+ _CET_ENDBR
movzx mxd, mxb
dec srcq
vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
@@ -1588,6 +1651,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
movu xm0, [srcq+ssq*0]
vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
@@ -1600,6 +1664,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w8
RET
.h_w16:
+ _CET_ENDBR
mova m6, [spel_h_perm16a]
mova m7, [spel_h_perm16b]
mova m8, [spel_h_perm16c]
@@ -1616,6 +1681,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w16_loop
RET
.h_w32:
+ _CET_ENDBR
movu ym0, [srcq+ssq*0+8*0]
vinserti32x8 m0, [srcq+ssq*1+8*0], 1
movu ym1, [srcq+ssq*0+8*1]
@@ -1631,6 +1697,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w32
RET
.h_w64:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
add srcq, ssq
@@ -1643,6 +1710,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w64
RET
.h_w128:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m2, [srcq+8*1]
movu m1, [srcq+8*8]
@@ -1661,6 +1729,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w128
RET
.v:
+ _CET_ENDBR
movzx mxd, myb
shr myd, 16
cmp hd, 6
@@ -1678,6 +1747,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
sub srcq, ss3q
jmp r6
.v_w2:
+ _CET_ENDBR
movd xmm2, [srcq+ssq*0]
pinsrw xmm2, [srcq+ssq*1], 2
pinsrw xmm2, [srcq+ssq*2], 4
@@ -1718,6 +1788,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .v_w2_loop
RET
.v_w4:
+ _CET_ENDBR
movd xmm2, [srcq+ssq*0]
pinsrd xmm2, [srcq+ssq*1], 1
pinsrd xmm2, [srcq+ssq*2], 2
@@ -1758,6 +1829,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
movq xmm1, [srcq+ssq*0]
vpbroadcastq ymm0, [srcq+ssq*1]
vpbroadcastq ymm2, [srcq+ssq*2]
@@ -1803,6 +1875,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
vzeroupper
RET
.v_w16:
+ _CET_ENDBR
mova m12, [spel_v_perm16]
vbroadcasti32x4 m1, [srcq+ssq*0]
vbroadcasti32x4 ym4, [srcq+ssq*1]
@@ -1847,6 +1920,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .v_w16_loop
RET
.v_w32:
+ _CET_ENDBR
mova m12, [spel_v_perm32]
pmovzxbq m14, [pb_02461357]
vpshrdw m13, m12, m12, 8
@@ -1902,6 +1976,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
RET
.v_w64:
.v_w128:
+ _CET_ENDBR
lea r6d, [hq+wq*4-256]
mov r4, srcq
mov r7, dstq
@@ -1992,6 +2067,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
vzeroupper
RET
.hv:
+ _CET_ENDBR
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
@@ -2071,6 +2147,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
vzeroupper
RET
.hv_w4:
+ _CET_ENDBR
movq xmm1, [r6+ssq*0]
vpbroadcastq ym2, [r6+ssq*1]
vinserti32x4 ym1, ymm1, [r6+ssq*2], 1
@@ -2122,6 +2199,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
shr mxd, 16
sub srcq, 3
vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
@@ -2217,6 +2295,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
vzeroupper
RET
.hv_w16:
+ _CET_ENDBR
movu m7, [spel_hv_perm16a]
sub srcq, ss3q
mova m20, [spel_hv_perm16b]
@@ -2393,6 +2472,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
%endif
jmp wq
.h:
+ _CET_ENDBR
test myd, 0xf00
jnz .hv
vpbroadcastd m4, [pd_2]
@@ -2408,6 +2488,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
add wq, r7
jmp wq
.h_w4:
+ _CET_ENDBR
movzx mxd, mxb
vbroadcasti128 ym5, [subpel_h_shufA]
mov r3d, 0x4
@@ -2435,6 +2516,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
vbroadcasti128 m5, [subpel_h_shufA]
vbroadcasti128 m6, [subpel_h_shufB]
vbroadcasti128 m7, [subpel_h_shufC]
@@ -2462,6 +2544,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_w8_loop
RET
.h_w16:
+ _CET_ENDBR
mova m5, [spel_h_perm16a]
mova m6, [spel_h_perm16b]
mova m7, [spel_h_perm16c]
@@ -2478,6 +2561,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_w16_loop
RET
.h_w32:
+ _CET_ENDBR
mova m5, [spel_h_perm32a]
mova m6, [spel_h_perm32b]
mova m7, [spel_h_perm32c]
@@ -2491,9 +2575,11 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_w32_loop
RET
.h_w64:
+ _CET_ENDBR
xor r6d, r6d
jmp .h_start
.h_w128:
+ _CET_ENDBR
mov r6, -64*1
.h_start:
mova m5, [spel_h_perm32a]
@@ -2514,6 +2600,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_loop
RET
.v:
+ _CET_ENDBR
movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
shr myd, 16 ; Note that the code is 8-tap only, having
tzcnt wd, wd
@@ -2532,6 +2619,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
vpbroadcastw m11, [myq+6]
jmp wq
.v_w4:
+ _CET_ENDBR
movd xmm0, [srcq+strideq*0]
vpbroadcastd ymm1, [srcq+strideq*2]
vpbroadcastd xmm2, [srcq+strideq*1]
@@ -2577,6 +2665,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
vzeroupper
RET
.v_w8:
+ _CET_ENDBR
mov r3d, 0xf044
kmovw k1, r3d
kshiftrw k2, k1, 8
@@ -2627,6 +2716,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .v_w8_loop
RET
.v_w16:
+ _CET_ENDBR
mov r3d, 0xf0
kmovb k1, r3d
vbroadcasti128 m0, [srcq+strideq*0]
@@ -2688,6 +2778,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .v_w16_loop
RET
.v_w32:
+ _CET_ENDBR
mova m18, [bilin_v_perm64]
movu ym0, [srcq+strideq*0]
movu ym1, [srcq+strideq*1]
@@ -2751,9 +2842,11 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
vzeroupper
RET
.v_w64:
+ _CET_ENDBR
mov wd, 64
jmp .v_start
.v_w128:
+ _CET_ENDBR
mov wd, 128
.v_start:
WIN64_SPILL_XMM 27
@@ -2853,6 +2946,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .v_loop0
RET
.hv:
+ _CET_ENDBR
%assign stack_offset stack_offset - stack_size_padded
%assign stack_size_padded 0
WIN64_SPILL_XMM 16
@@ -2955,6 +3049,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
vzeroupper
RET
.hv_w8:
+ _CET_ENDBR
WIN64_SPILL_XMM 24
vbroadcasti128 m16, [subpel_h_shufA]
vbroadcasti128 m17, [subpel_h_shufB]
@@ -3040,15 +3135,19 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .hv_w8_loop
RET
.hv_w16:
+ _CET_ENDBR
mov wd, 16*2
jmp .hv_start
.hv_w32:
+ _CET_ENDBR
mov wd, 32*2
jmp .hv_start
.hv_w64:
+ _CET_ENDBR
mov wd, 64*2
jmp .hv_start
.hv_w128:
+ _CET_ENDBR
mov wd, 128*2
.hv_start:
WIN64_SPILL_XMM 31
@@ -3280,6 +3379,7 @@ ALIGN function_align
ret
ALIGN function_align
.h:
+ _CET_ENDBR
movu xm5, [srcq+ssq*1]
psrad ym16, ym18, 10
lea srcq, [srcq+ssq*2]
@@ -3305,6 +3405,7 @@ ALIGN function_align
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
cmp hd, 8
jg .w4_h16
WRAP_YMM %1 0
@@ -3329,6 +3430,7 @@ ALIGN function_align
vpscatterdd [dstq+m7]{k1}, m0
RET
.w8:
+ _CET_ENDBR
cmp hd, 4
jne .w8_h8
WRAP_YMM %1 0
@@ -3362,6 +3464,7 @@ ALIGN function_align
%1_INC_PTR 2
lea dstq, [dstq+strideq*4]
.w16:
+ _CET_ENDBR
%1 0
vpermq m0, m0, q3120
mova [dstq ], xm0
@@ -3372,6 +3475,7 @@ ALIGN function_align
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
pmovzxbq m7, [pb_02461357]
.w32_loop:
%1 0
@@ -3384,6 +3488,7 @@ ALIGN function_align
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
pmovzxbq m7, [pb_02461357]
.w64_loop:
%1 0
@@ -3395,6 +3500,7 @@ ALIGN function_align
jg .w64_loop
RET
.w128:
+ _CET_ENDBR
pmovzxbq m7, [pb_02461357]
.w128_loop:
%1 0
@@ -3566,6 +3672,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
mova m5, [wm_420_perm4]
cmp hd, 8
jg .w4_h16
@@ -3600,6 +3707,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
vpscatterdd [dstq+m11]{k1}, m0
RET
.w8:
+ _CET_ENDBR
mova m5, [wm_420_perm8]
cmp hd, 4
jne .w8_h8
@@ -3643,6 +3751,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
jg .w8_loop
RET
.w16:
+ _CET_ENDBR
mova m5, [wm_420_perm16]
.w16_loop:
W_MASK 0, 4, 0, 1
@@ -3664,6 +3773,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
pmovzxbq m5, [pb_02461357]
.w32_loop:
W_MASK 0, 4, 0, 1
@@ -3682,6 +3792,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
.w64_loop:
@@ -3706,6 +3817,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
jg .w64_loop
RET
.w128:
+ _CET_ENDBR
pmovzxbq m14, [wm_420_perm64]
mova m10, [wm_420_mask]
psrlq m15, m14, 4
@@ -3760,6 +3872,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
cmp hd, 8
jg .w4_h16
WRAP_YMM W_MASK 0, 4, 0, 1
@@ -3793,6 +3906,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
vpscatterdd [dstq+m5]{k1}, m0
RET
.w8:
+ _CET_ENDBR
cmp hd, 4
jne .w8_h8
WRAP_YMM W_MASK 0, 4, 0, 1
@@ -3840,6 +3954,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
add maskq, 32
lea dstq, [dstq+strideq*4]
.w16:
+ _CET_ENDBR
W_MASK 0, 4, 0, 1
mova m1, m8
vpdpwssd m1, m4, m9
@@ -3855,6 +3970,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
pmovzxbq m5, [pb_02461357]
.w32_loop:
W_MASK 0, 4, 0, 1
@@ -3874,6 +3990,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
pmovzxbq m5, [pb_02461357]
.w64_loop:
W_MASK 0, 4, 0, 1
@@ -3892,6 +4009,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
jg .w64_loop
RET
.w128:
+ _CET_ENDBR
pmovzxbq m13, [pb_02461357]
.w128_loop:
W_MASK 0, 4, 0, 1
@@ -3930,6 +4048,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
cmp hd, 8
jg .w4_h16
WRAP_YMM W_MASK 0, 4, 0, 1, 1
@@ -3959,6 +4078,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
vpscatterdd [dstq+m9]{k1}, m0
RET
.w8:
+ _CET_ENDBR
cmp hd, 4
jne .w8_h8
WRAP_YMM W_MASK 0, 4, 0, 1, 1
@@ -4001,6 +4121,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
add maskq, 64
lea dstq, [dstq+strideq*4]
.w16:
+ _CET_ENDBR
W_MASK 0, 4, 0, 1, 1
vpermb m4, m8, m4
vpermq m0, m0, q3120
@@ -4013,6 +4134,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
pmovzxbq m9, [pb_02461357]
.w32_loop:
W_MASK 0, 4, 0, 1, 1
@@ -4029,6 +4151,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
pmovzxbq m9, [pb_02461357]
.w64_loop:
W_MASK 0, 4, 0, 1, 1
@@ -4044,6 +4167,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
jg .w64_loop
RET
.w128:
+ _CET_ENDBR
pmovzxbq m11, [pb_02461357]
.w128_loop:
W_MASK 0, 4, 0, 1, 1
@@ -4078,6 +4202,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
lea r6, [dsq*3]
jmp wq
.w4:
+ _CET_ENDBR
movd xmm0, [dstq+dsq*0]
pinsrd xmm0, [dstq+dsq*1], 1
vpbroadcastd xmm1, [dstq+dsq*2]
@@ -4104,6 +4229,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
jg .w4
RET
.w8:
+ _CET_ENDBR
movq xmm0, [dstq+dsq*0]
vpbroadcastq xmm1, [dstq+dsq*1]
vpbroadcastq ymm2, [dstq+dsq*2]
@@ -4134,6 +4260,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
vzeroupper
RET
.w16:
+ _CET_ENDBR
mova xm1, [dstq+dsq*0]
vinserti32x4 ym1, [dstq+dsq*1], 1
vinserti32x4 m1, [dstq+dsq*2], 2
@@ -4160,6 +4287,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
jg .w16
RET
.w32:
+ _CET_ENDBR
mova ym1, [dstq+dsq*0]
vinserti32x8 m1, [dstq+dsq*1], 1
mova m4, [maskq]
@@ -4193,6 +4321,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
add maskq, obmc_masks-blend_v_avx512icl_table
jmp wq
.w2:
+ _CET_ENDBR
vpbroadcastd xmm2, [maskq+2*2]
.w2_s0_loop:
movd xmm0, [dstq+dsq*0]
@@ -4210,6 +4339,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w2_s0_loop
RET
.w4:
+ _CET_ENDBR
vpbroadcastq xmm2, [maskq+4*2]
.w4_loop:
movd xmm0, [dstq+dsq*0]
@@ -4227,6 +4357,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w4_loop
RET
.w8:
+ _CET_ENDBR
mova xmm3, [maskq+8*2]
.w8_loop:
movq xmm0, [dstq+dsq*0]
@@ -4247,6 +4378,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w8_loop
RET
.w16:
+ _CET_ENDBR
vbroadcasti32x4 ym3, [maskq+16*2]
vbroadcasti32x4 ym4, [maskq+16*3]
.w16_loop:
@@ -4268,6 +4400,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
mova m4, [maskq+32*2]
vshufi32x4 m3, m4, m4, q2020
vshufi32x4 m4, m4, q3131
@@ -4305,6 +4438,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
neg hq
jmp wq
.w2:
+ _CET_ENDBR
movd xmm0, [dstq+dsq*0]
pinsrw xmm0, [dstq+dsq*1], 1
movd xmm2, [maskq+hq*2]
@@ -4322,6 +4456,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w2
RET
.w4:
+ _CET_ENDBR
mova xmm3, [blend_shuf]
.w4_loop:
movd xmm0, [dstq+dsq*0]
@@ -4341,6 +4476,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w4_loop
RET
.w8:
+ _CET_ENDBR
vbroadcasti128 ymm4, [blend_shuf]
shufpd ymm4, ymm4, 0x03
.w8_loop:
@@ -4365,6 +4501,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
vzeroupper
RET
.w16:
+ _CET_ENDBR
vbroadcasti32x4 ym4, [blend_shuf]
shufpd ym4, ym4, 0x0c
.w16_loop:
@@ -4388,6 +4525,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w16_loop
RET
.w32:
+ _CET_ENDBR
vbroadcasti32x4 m4, [blend_shuf]
shufpd m4, m4, 0xf0
.w32_loop:
@@ -4411,6 +4549,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w32_loop
RET
.w64:
+ _CET_ENDBR
vpbroadcastw m3, [maskq+hq*2]
mova m1, [dstq]
mova m2, [tmpq]
@@ -4428,6 +4567,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w64
RET
.w128:
+ _CET_ENDBR
vpbroadcastw m6, [maskq+hq*2]
mova m2, [dstq+64*0]
mova m1, [tmpq+64*0]