Add _CET_ENDBR landing-pad markers after the jump-table target labels
(.put_w*, .h_w*, .v_w*, .hv_w*, .prep_w*, .w*) in the AVX-512 motion
compensation code. These labels are reached via indirect jumps
("jmp wq"), so each needs an ENDBR instruction for the code to run
with Intel CET indirect branch tracking (IBT) enforcement enabled.
Index: src/x86/mc_avx512.asm
--- src/x86/mc_avx512.asm.orig
+++ src/x86/mc_avx512.asm
@@ -321,10 +321,12 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
test mxyd, mxyd
jnz .v
.put:
+ _CET_ENDBR
movzx wd, word [r7+wq*2+table_offset(put,)]
add wq, r7
jmp wq
.put_w2:
+ _CET_ENDBR
movzx r6d, word [srcq+ssq*0]
movzx r7d, word [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -335,6 +337,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w2
RET
.put_w4:
+ _CET_ENDBR
mov r6d, [srcq+ssq*0]
mov r7d, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -345,6 +348,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w4
RET
.put_w8:
+ _CET_ENDBR
mov r6, [srcq+ssq*0]
mov r7, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -355,6 +359,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w8
RET
.put_w16:
+ _CET_ENDBR
movu xmm0, [srcq+ssq*0]
movu xmm1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -365,6 +370,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w16
RET
.put_w32:
+ _CET_ENDBR
movu ym0, [srcq+ssq*0]
movu ym1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -375,6 +381,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w32
RET
.put_w64:
+ _CET_ENDBR
movu m0, [srcq+ssq*0]
movu m1, [srcq+ssq*1]
lea srcq, [srcq+ssq*2]
@@ -385,6 +392,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w64
RET
.put_w128:
+ _CET_ENDBR
movu m0, [srcq+ssq*0+64*0]
movu m1, [srcq+ssq*0+64*1]
movu m2, [srcq+ssq*1+64*0]
@@ -399,6 +407,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .put_w128
RET
.h:
+ _CET_ENDBR
; (16 * src[x] + (mx * (src[x + 1] - src[x])) + 8) >> 4
; = ((16 - mx) * src[x] + mx * src[x + 1] + 8) >> 4
imul mxyd, 0xff01
@@ -413,6 +422,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
add wq, r7
jmp wq
.h_w2:
+ _CET_ENDBR
movd xmm0, [srcq+ssq*0]
pinsrd xmm0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
@@ -427,6 +437,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w2
RET
.h_w4:
+ _CET_ENDBR
mova xmm4, [bilin_h_shuf4]
.h_w4_loop:
movq xmm0, [srcq+ssq*0]
@@ -443,6 +454,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
movu xm0, [srcq+ssq*0]
vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
@@ -457,6 +469,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w8
RET
.h_w16:
+ _CET_ENDBR
mova m4, [bilin_h_perm16]
.h_w16_loop:
movu ym0, [srcq+ssq*0]
@@ -473,6 +486,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w16_loop
RET
.h_w32:
+ _CET_ENDBR
movu ym0, [srcq+ssq*0+8*0]
vinserti32x8 m0, [srcq+ssq*1+8*0], 1
movu ym1, [srcq+ssq*0+8*1]
@@ -492,6 +506,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w32
RET
.h_w64:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
@@ -508,6 +523,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w64
RET
.h_w128:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m2, [srcq+8*1]
movu m1, [srcq+8*8]
@@ -525,6 +541,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .h_w128
RET
.v:
+ _CET_ENDBR
movzx wd, word [r7+wq*2+table_offset(put, _bilin_v)]
imul mxyd, 0xff01
vpbroadcastd m5, [pw_2048]
@@ -533,6 +550,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
vpbroadcastw m4, mxyd
jmp wq
.v_w2:
+ _CET_ENDBR
movd xmm0, [srcq+ssq*0]
.v_w2_loop:
pinsrw xmm1, xmm0, [srcq+ssq*1], 1 ; 0 1
@@ -550,6 +568,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w2_loop
RET
.v_w4:
+ _CET_ENDBR
movd xmm0, [srcq+ssq*0]
.v_w4_loop:
vpbroadcastd xmm1, [srcq+ssq*1]
@@ -568,6 +587,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
movq xmm0, [srcq+ssq*0]
.v_w8_loop:
movq xmm3, [srcq+ssq*1]
@@ -587,6 +607,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w8_loop
RET
.v_w16:
+ _CET_ENDBR
movu xmm0, [srcq+ssq*0]
.v_w16_loop:
vbroadcasti128 ymm2, [srcq+ssq*1]
@@ -609,6 +630,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
vzeroupper
RET
.v_w32:
+ _CET_ENDBR
movu ym0, [srcq+ssq*0]
kxnorb k1, k1, k1
.v_w32_loop:
@@ -631,6 +653,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w32_loop
RET
.v_w64:
+ _CET_ENDBR
movu m0, [srcq+ssq*0]
.v_w64_loop:
movu m3, [srcq+ssq*1]
@@ -654,6 +677,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w64_loop
RET
.v_w128:
+ _CET_ENDBR
movu m0, [srcq+64*0]
movu m1, [srcq+64*1]
.v_w128_loop:
@@ -680,6 +704,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .v_w128_loop
RET
.hv:
+ _CET_ENDBR
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 128) >> 8
; = (src[x] + ((my * (src[x + src_stride] - src[x])) >> 4) + 8) >> 4
movzx wd, word [r7+wq*2+table_offset(put, _bilin_hv)]
@@ -690,6 +715,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
vpbroadcastw m6, mxyd
jmp wq
.hv_w2:
+ _CET_ENDBR
vpbroadcastd xmm0, [srcq+ssq*0]
pshufb xmm0, xm4
pmaddubsw xmm0, xm5
@@ -714,6 +740,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w2_loop
RET
.hv_w4:
+ _CET_ENDBR
mova xmm4, [bilin_h_shuf4]
movddup xmm0, [srcq+ssq*0]
pshufb xmm0, xmm4
@@ -739,6 +766,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
vbroadcasti128 ym0, [srcq+ssq*0]
pshufb ym0, ym4
pmaddubsw ym0, ym5
@@ -763,6 +791,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w8_loop
RET
.hv_w16:
+ _CET_ENDBR
vbroadcasti32x8 m0, [srcq+ssq*0]
mova m4, [bilin_h_perm16]
vpermb m0, m4, m0
@@ -788,6 +817,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w16_loop
RET
.hv_w32:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+ssq*0]
pmovzxbq m8, [pb_02461357]
@@ -817,6 +847,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w32_loop
RET
.hv_w64:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
pshufb m0, m4
@@ -850,6 +881,7 @@ cglobal put_bilin_8bpc, 4, 8, 0, dst, ds, src, ss, w,
jg .hv_w64_loop
RET
.hv_w128:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
movu m2, [srcq+8*8]
@@ -910,11 +942,13 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
test mxyd, mxyd
jnz .v
.prep:
+ _CET_ENDBR
movzx wd, word [t2+wq*2+table_offset(prep,)]
add wq, t2
lea stride3q, [strideq*3]
jmp wq
.prep_w4:
+ _CET_ENDBR
movd xmm0, [srcq+strideq*0]
pinsrd xmm0, [srcq+strideq*1], 1
pinsrd xmm0, [srcq+strideq*2], 2
@@ -928,6 +962,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w4
RET
.prep_w8:
+ _CET_ENDBR
movq xmm0, [srcq+strideq*0]
movq xmm1, [srcq+strideq*1]
vinserti128 ym0, ymm0, [srcq+strideq*2], 1
@@ -942,6 +977,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w8
RET
.prep_w16:
+ _CET_ENDBR
movu xmm0, [srcq+strideq*0]
vinserti128 ym0, ymm0, [srcq+strideq*1], 1
movu xmm1, [srcq+strideq*2]
@@ -958,6 +994,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w16
RET
.prep_w32:
+ _CET_ENDBR
pmovzxbw m0, [srcq+strideq*0]
pmovzxbw m1, [srcq+strideq*1]
pmovzxbw m2, [srcq+strideq*2]
@@ -973,6 +1010,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w32
RET
.prep_w64:
+ _CET_ENDBR
pmovzxbw m0, [srcq+strideq*0+32*0]
pmovzxbw m1, [srcq+strideq*0+32*1]
pmovzxbw m2, [srcq+strideq*1+32*0]
@@ -988,6 +1026,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w64
RET
.prep_w128:
+ _CET_ENDBR
pmovzxbw m0, [srcq+32*0]
pmovzxbw m1, [srcq+32*1]
pmovzxbw m2, [srcq+32*2]
@@ -1003,6 +1042,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .prep_w128
RET
.h:
+ _CET_ENDBR
; 16 * src[x] + (mx * (src[x + 1] - src[x]))
; = (16 - mx) * src[x] + mx * src[x + 1]
imul mxyd, 0xff01
@@ -1016,6 +1056,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
lea stride3q, [strideq*3]
jmp wq
.h_w4:
+ _CET_ENDBR
vbroadcasti32x4 ym4, [bilin_h_shuf4]
.h_w4_loop:
movq xmm0, [srcq+strideq*0]
@@ -1032,6 +1073,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
vbroadcasti32x4 m4, [bilin_h_shuf8]
.h_w8_loop:
movu xmm0, [srcq+strideq*0]
@@ -1047,6 +1089,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w8_loop
RET
.h_w16:
+ _CET_ENDBR
mova m4, [bilin_h_perm16]
.h_w16_loop:
movu ym0, [srcq+strideq*0]
@@ -1065,6 +1108,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w16_loop
RET
.h_w32:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
.h_w32_loop:
vpermb m0, m4, [srcq+strideq*0]
@@ -1085,6 +1129,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w32_loop
RET
.h_w64:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
.h_w64_loop:
vpermb m0, m4, [srcq+strideq*0+32*0]
@@ -1105,6 +1150,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w64_loop
RET
.h_w128:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
.h_w128_loop:
vpermb m0, m4, [srcq+32*0]
@@ -1125,6 +1171,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .h_w128_loop
RET
.v:
+ _CET_ENDBR
WIN64_SPILL_XMM 7
movzx wd, word [t2+wq*2+table_offset(prep, _bilin_v)]
imul mxyd, 0xff01
@@ -1134,6 +1181,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
vpbroadcastw m6, mxyd
jmp wq
.v_w4:
+ _CET_ENDBR
vpbroadcastd xm0, [srcq+strideq*0]
mov r3d, 0x29
vbroadcasti32x4 ym3, [bilin_v_shuf4]
@@ -1153,6 +1201,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
mova m5, [bilin_v_perm8]
vbroadcasti32x4 ym0, [srcq+strideq*0]
.v_w8_loop:
@@ -1169,6 +1218,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w8_loop
RET
.v_w16:
+ _CET_ENDBR
mova m5, [bilin_v_perm16]
movu xm0, [srcq+strideq*0]
.v_w16_loop:
@@ -1188,6 +1238,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w16_loop
RET
.v_w32:
+ _CET_ENDBR
mova m5, [bilin_v_perm32]
movu ym0, [srcq+strideq*0]
.v_w32_loop:
@@ -1213,6 +1264,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w32_loop
RET
.v_w64:
+ _CET_ENDBR
mova m5, [bilin_v_perm64]
vpermq m0, m5, [srcq+strideq*0]
.v_w64_loop:
@@ -1236,6 +1288,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w64_loop
RET
.v_w128:
+ _CET_ENDBR
mova m5, [bilin_v_perm64]
vpermq m0, m5, [srcq+strideq*0+ 0]
vpermq m1, m5, [srcq+strideq*0+64]
@@ -1274,6 +1327,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .v_w128_loop
RET
.hv:
+ _CET_ENDBR
; (16 * src[x] + (my * (src[x + src_stride] - src[x])) + 8) >> 4
; = src[x] + (((my * (src[x + src_stride] - src[x])) + 8) >> 4)
%assign stack_offset stack_offset - stack_size_padded
@@ -1285,6 +1339,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
lea stride3q, [strideq*3]
jmp wq
.hv_w4:
+ _CET_ENDBR
vbroadcasti32x4 ym4, [bilin_h_shuf4]
vpbroadcastq ym0, [srcq+strideq*0]
pshufb ym0, ym4
@@ -1309,6 +1364,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
vbroadcasti32x4 m4, [bilin_h_shuf8]
vbroadcasti32x4 m0, [srcq+strideq*0]
pshufb m0, m4
@@ -1332,6 +1388,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w8_loop
RET
.hv_w16:
+ _CET_ENDBR
mova m4, [bilin_h_perm16]
vbroadcasti32x8 m0, [srcq+strideq*0]
vpermb m0, m4, m0
@@ -1361,6 +1418,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w16_loop
RET
.hv_w32:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+strideq*0]
pmaddubsw m0, m5
@@ -1383,6 +1441,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w32_loop
RET
.hv_w64:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+32*0]
vpermb m1, m4, [srcq+32*1]
@@ -1409,6 +1468,7 @@ cglobal prep_bilin_8bpc, 3, 7, 0, tmp, src, stride, w,
jg .hv_w64_loop
RET
.hv_w128:
+ _CET_ENDBR
mova m4, [bilin_h_perm32]
vpermb m0, m4, [srcq+32*0]
vpermb m1, m4, [srcq+32*1]
@@ -1525,6 +1585,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
%endif
jmp wq
.h:
+ _CET_ENDBR
test myd, 0xf00
jnz .hv
vpbroadcastd m5, [pd_34] ; 2 + (8 << 2)
@@ -1544,6 +1605,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
add wq, r8
jmp wq
.h_w2:
+ _CET_ENDBR
movzx mxd, mxb
dec srcq
mova xmm4, [subpel_h_shuf4]
@@ -1565,6 +1627,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w2_loop
RET
.h_w4:
+ _CET_ENDBR
movzx mxd, mxb
dec srcq
vpbroadcastd xmm3, [base+mxq*8+subpel_filters+2]
@@ -1588,6 +1651,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
movu xm0, [srcq+ssq*0]
vinserti32x4 ym0, [srcq+ssq*1], 1
lea srcq, [srcq+ssq*2]
@@ -1600,6 +1664,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w8
RET
.h_w16:
+ _CET_ENDBR
mova m6, [spel_h_perm16a]
mova m7, [spel_h_perm16b]
mova m8, [spel_h_perm16c]
@@ -1616,6 +1681,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w16_loop
RET
.h_w32:
+ _CET_ENDBR
movu ym0, [srcq+ssq*0+8*0]
vinserti32x8 m0, [srcq+ssq*1+8*0], 1
movu ym1, [srcq+ssq*0+8*1]
@@ -1631,6 +1697,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w32
RET
.h_w64:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m1, [srcq+8*1]
add srcq, ssq
@@ -1643,6 +1710,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w64
RET
.h_w128:
+ _CET_ENDBR
movu m0, [srcq+8*0]
movu m2, [srcq+8*1]
movu m1, [srcq+8*8]
@@ -1661,6 +1729,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .h_w128
RET
.v:
+ _CET_ENDBR
movzx mxd, myb
shr myd, 16
cmp hd, 6
@@ -1678,6 +1747,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
sub srcq, ss3q
jmp r6
.v_w2:
+ _CET_ENDBR
movd xmm2, [srcq+ssq*0]
pinsrw xmm2, [srcq+ssq*1], 2
pinsrw xmm2, [srcq+ssq*2], 4
@@ -1718,6 +1788,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .v_w2_loop
RET
.v_w4:
+ _CET_ENDBR
movd xmm2, [srcq+ssq*0]
pinsrd xmm2, [srcq+ssq*1], 1
pinsrd xmm2, [srcq+ssq*2], 2
@@ -1758,6 +1829,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .v_w4_loop
RET
.v_w8:
+ _CET_ENDBR
movq xmm1, [srcq+ssq*0]
vpbroadcastq ymm0, [srcq+ssq*1]
vpbroadcastq ymm2, [srcq+ssq*2]
@@ -1803,6 +1875,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
vzeroupper
RET
.v_w16:
+ _CET_ENDBR
mova m12, [spel_v_perm16]
vbroadcasti32x4 m1, [srcq+ssq*0]
vbroadcasti32x4 ym4, [srcq+ssq*1]
@@ -1847,6 +1920,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .v_w16_loop
RET
.v_w32:
+ _CET_ENDBR
mova m12, [spel_v_perm32]
pmovzxbq m14, [pb_02461357]
vpshrdw m13, m12, m12, 8
@@ -1902,6 +1976,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
RET
.v_w64:
.v_w128:
+ _CET_ENDBR
lea r6d, [hq+wq*4-256]
mov r4, srcq
mov r7, dstq
@@ -1992,6 +2067,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
vzeroupper
RET
.hv:
+ _CET_ENDBR
cmp wd, 4
jg .hv_w8
movzx mxd, mxb
@@ -2071,6 +2147,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
vzeroupper
RET
.hv_w4:
+ _CET_ENDBR
movq xmm1, [r6+ssq*0]
vpbroadcastq ym2, [r6+ssq*1]
vinserti32x4 ym1, ymm1, [r6+ssq*2], 1
@@ -2122,6 +2199,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
jg .hv_w4_loop
RET
.hv_w8:
+ _CET_ENDBR
shr mxd, 16
sub srcq, 3
vpbroadcastd m10, [base+subpel_filters+mxq*8+0]
@@ -2217,6 +2295,7 @@ cglobal put_8tap_8bpc, 4, 9, 0, dst, ds, src, ss, w, h
vzeroupper
RET
.hv_w16:
+ _CET_ENDBR
movu m7, [spel_hv_perm16a]
sub srcq, ss3q
mova m20, [spel_hv_perm16b]
@@ -2393,6 +2472,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
%endif
jmp wq
.h:
+ _CET_ENDBR
test myd, 0xf00
jnz .hv
vpbroadcastd m4, [pd_2]
@@ -2408,6 +2488,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
add wq, r7
jmp wq
.h_w4:
+ _CET_ENDBR
movzx mxd, mxb
vbroadcasti128 ym5, [subpel_h_shufA]
mov r3d, 0x4
@@ -2435,6 +2516,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_w4_loop
RET
.h_w8:
+ _CET_ENDBR
vbroadcasti128 m5, [subpel_h_shufA]
vbroadcasti128 m6, [subpel_h_shufB]
vbroadcasti128 m7, [subpel_h_shufC]
@@ -2462,6 +2544,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_w8_loop
RET
.h_w16:
+ _CET_ENDBR
mova m5, [spel_h_perm16a]
mova m6, [spel_h_perm16b]
mova m7, [spel_h_perm16c]
@@ -2478,6 +2561,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_w16_loop
RET
.h_w32:
+ _CET_ENDBR
mova m5, [spel_h_perm32a]
mova m6, [spel_h_perm32b]
mova m7, [spel_h_perm32c]
@@ -2491,9 +2575,11 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_w32_loop
RET
.h_w64:
+ _CET_ENDBR
xor r6d, r6d
jmp .h_start
.h_w128:
+ _CET_ENDBR
mov r6, -64*1
.h_start:
mova m5, [spel_h_perm32a]
@@ -2514,6 +2600,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .h_loop
RET
.v:
+ _CET_ENDBR
movzx mxd, myb ; Select 4-tap/8-tap filter multipliers.
shr myd, 16 ; Note that the code is 8-tap only, having
tzcnt wd, wd
@@ -2532,6 +2619,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
vpbroadcastw m11, [myq+6]
jmp wq
.v_w4:
+ _CET_ENDBR
movd xmm0, [srcq+strideq*0]
vpbroadcastd ymm1, [srcq+strideq*2]
vpbroadcastd xmm2, [srcq+strideq*1]
@@ -2577,6 +2665,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
vzeroupper
RET
.v_w8:
+ _CET_ENDBR
mov r3d, 0xf044
kmovw k1, r3d
kshiftrw k2, k1, 8
@@ -2627,6 +2716,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .v_w8_loop
RET
.v_w16:
+ _CET_ENDBR
mov r3d, 0xf0
kmovb k1, r3d
vbroadcasti128 m0, [srcq+strideq*0]
@@ -2688,6 +2778,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .v_w16_loop
RET
.v_w32:
+ _CET_ENDBR
mova m18, [bilin_v_perm64]
movu ym0, [srcq+strideq*0]
movu ym1, [srcq+strideq*1]
@@ -2751,9 +2842,11 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
vzeroupper
RET
.v_w64:
+ _CET_ENDBR
mov wd, 64
jmp .v_start
.v_w128:
+ _CET_ENDBR
mov wd, 128
.v_start:
WIN64_SPILL_XMM 27
@@ -2853,6 +2946,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .v_loop0
RET
.hv:
+ _CET_ENDBR
%assign stack_offset stack_offset - stack_size_padded
%assign stack_size_padded 0
WIN64_SPILL_XMM 16
@@ -2955,6 +3049,7 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
vzeroupper
RET
.hv_w8:
+ _CET_ENDBR
WIN64_SPILL_XMM 24
vbroadcasti128 m16, [subpel_h_shufA]
vbroadcasti128 m17, [subpel_h_shufB]
@@ -3040,15 +3135,19 @@ cglobal prep_8tap_8bpc, 3, 8, 0, tmp, src, stride, w,
jg .hv_w8_loop
RET
.hv_w16:
+ _CET_ENDBR
mov wd, 16*2
jmp .hv_start
.hv_w32:
+ _CET_ENDBR
mov wd, 32*2
jmp .hv_start
.hv_w64:
+ _CET_ENDBR
mov wd, 64*2
jmp .hv_start
.hv_w128:
+ _CET_ENDBR
mov wd, 128*2
.hv_start:
WIN64_SPILL_XMM 31
@@ -3280,6 +3379,7 @@ ALIGN function_align
ret
ALIGN function_align
.h:
+ _CET_ENDBR
movu xm5, [srcq+ssq*1]
psrad ym16, ym18, 10
lea srcq, [srcq+ssq*2]
@@ -3305,6 +3405,7 @@ ALIGN function_align
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
cmp hd, 8
jg .w4_h16
WRAP_YMM %1 0
@@ -3329,6 +3430,7 @@ ALIGN function_align
vpscatterdd [dstq+m7]{k1}, m0
RET
.w8:
+ _CET_ENDBR
cmp hd, 4
jne .w8_h8
WRAP_YMM %1 0
@@ -3362,6 +3464,7 @@ ALIGN function_align
%1_INC_PTR 2
lea dstq, [dstq+strideq*4]
.w16:
+ _CET_ENDBR
%1 0
vpermq m0, m0, q3120
mova [dstq ], xm0
@@ -3372,6 +3475,7 @@ ALIGN function_align
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
pmovzxbq m7, [pb_02461357]
.w32_loop:
%1 0
@@ -3384,6 +3488,7 @@ ALIGN function_align
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
pmovzxbq m7, [pb_02461357]
.w64_loop:
%1 0
@@ -3395,6 +3500,7 @@ ALIGN function_align
jg .w64_loop
RET
.w128:
+ _CET_ENDBR
pmovzxbq m7, [pb_02461357]
.w128_loop:
%1 0
@@ -3566,6 +3672,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
mova m5, [wm_420_perm4]
cmp hd, 8
jg .w4_h16
@@ -3600,6 +3707,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
vpscatterdd [dstq+m11]{k1}, m0
RET
.w8:
+ _CET_ENDBR
mova m5, [wm_420_perm8]
cmp hd, 4
jne .w8_h8
@@ -3643,6 +3751,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
jg .w8_loop
RET
.w16:
+ _CET_ENDBR
mova m5, [wm_420_perm16]
.w16_loop:
W_MASK 0, 4, 0, 1
@@ -3664,6 +3773,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
pmovzxbq m5, [pb_02461357]
.w32_loop:
W_MASK 0, 4, 0, 1
@@ -3682,6 +3792,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
pmovzxbq m12, [wm_420_perm64] ; 0, 2, 4, 6, 8, 10, 12, 14
psrlq m13, m12, 4 ; 1, 3, 5, 7, 9, 11, 13, 15
.w64_loop:
@@ -3706,6 +3817,7 @@ cglobal w_mask_420_8bpc, 4, 8, 16, dst, stride, tmp1,
jg .w64_loop
RET
.w128:
+ _CET_ENDBR
pmovzxbq m14, [wm_420_perm64]
mova m10, [wm_420_mask]
psrlq m15, m14, 4
@@ -3760,6 +3872,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
cmp hd, 8
jg .w4_h16
WRAP_YMM W_MASK 0, 4, 0, 1
@@ -3793,6 +3906,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
vpscatterdd [dstq+m5]{k1}, m0
RET
.w8:
+ _CET_ENDBR
cmp hd, 4
jne .w8_h8
WRAP_YMM W_MASK 0, 4, 0, 1
@@ -3840,6 +3954,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
add maskq, 32
lea dstq, [dstq+strideq*4]
.w16:
+ _CET_ENDBR
W_MASK 0, 4, 0, 1
mova m1, m8
vpdpwssd m1, m4, m9
@@ -3855,6 +3970,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
pmovzxbq m5, [pb_02461357]
.w32_loop:
W_MASK 0, 4, 0, 1
@@ -3874,6 +3990,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
pmovzxbq m5, [pb_02461357]
.w64_loop:
W_MASK 0, 4, 0, 1
@@ -3892,6 +4009,7 @@ cglobal w_mask_422_8bpc, 4, 8, 14, dst, stride, tmp1,
jg .w64_loop
RET
.w128:
+ _CET_ENDBR
pmovzxbq m13, [pb_02461357]
.w128_loop:
W_MASK 0, 4, 0, 1
@@ -3930,6 +4048,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
cmp hd, 8
jg .w4_h16
WRAP_YMM W_MASK 0, 4, 0, 1, 1
@@ -3959,6 +4078,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
vpscatterdd [dstq+m9]{k1}, m0
RET
.w8:
+ _CET_ENDBR
cmp hd, 4
jne .w8_h8
WRAP_YMM W_MASK 0, 4, 0, 1, 1
@@ -4001,6 +4121,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
add maskq, 64
lea dstq, [dstq+strideq*4]
.w16:
+ _CET_ENDBR
W_MASK 0, 4, 0, 1, 1
vpermb m4, m8, m4
vpermq m0, m0, q3120
@@ -4013,6 +4134,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
pmovzxbq m9, [pb_02461357]
.w32_loop:
W_MASK 0, 4, 0, 1, 1
@@ -4029,6 +4151,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
pmovzxbq m9, [pb_02461357]
.w64_loop:
W_MASK 0, 4, 0, 1, 1
@@ -4044,6 +4167,7 @@ cglobal w_mask_444_8bpc, 4, 8, 12, dst, stride, tmp1,
jg .w64_loop
RET
.w128:
+ _CET_ENDBR
pmovzxbq m11, [pb_02461357]
.w128_loop:
W_MASK 0, 4, 0, 1, 1
@@ -4078,6 +4202,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
lea r6, [dsq*3]
jmp wq
.w4:
+ _CET_ENDBR
movd xmm0, [dstq+dsq*0]
pinsrd xmm0, [dstq+dsq*1], 1
vpbroadcastd xmm1, [dstq+dsq*2]
@@ -4104,6 +4229,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
jg .w4
RET
.w8:
+ _CET_ENDBR
movq xmm0, [dstq+dsq*0]
vpbroadcastq xmm1, [dstq+dsq*1]
vpbroadcastq ymm2, [dstq+dsq*2]
@@ -4134,6 +4260,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
vzeroupper
RET
.w16:
+ _CET_ENDBR
mova xm1, [dstq+dsq*0]
vinserti32x4 ym1, [dstq+dsq*1], 1
vinserti32x4 m1, [dstq+dsq*2], 2
@@ -4160,6 +4287,7 @@ cglobal blend_8bpc, 3, 7, 8, dst, ds, tmp, w, h, mask
jg .w16
RET
.w32:
+ _CET_ENDBR
mova ym1, [dstq+dsq*0]
vinserti32x8 m1, [dstq+dsq*1], 1
mova m4, [maskq]
@@ -4193,6 +4321,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
add maskq, obmc_masks-blend_v_avx512icl_table
jmp wq
.w2:
+ _CET_ENDBR
vpbroadcastd xmm2, [maskq+2*2]
.w2_s0_loop:
movd xmm0, [dstq+dsq*0]
@@ -4210,6 +4339,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w2_s0_loop
RET
.w4:
+ _CET_ENDBR
vpbroadcastq xmm2, [maskq+4*2]
.w4_loop:
movd xmm0, [dstq+dsq*0]
@@ -4227,6 +4357,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w4_loop
RET
.w8:
+ _CET_ENDBR
mova xmm3, [maskq+8*2]
.w8_loop:
movq xmm0, [dstq+dsq*0]
@@ -4247,6 +4378,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w8_loop
RET
.w16:
+ _CET_ENDBR
vbroadcasti32x4 ym3, [maskq+16*2]
vbroadcasti32x4 ym4, [maskq+16*3]
.w16_loop:
@@ -4268,6 +4400,7 @@ cglobal blend_v_8bpc, 3, 6, 6, dst, ds, tmp, w, h, mas
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
mova m4, [maskq+32*2]
vshufi32x4 m3, m4, m4, q2020
vshufi32x4 m4, m4, q3131
@@ -4305,6 +4438,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
neg hq
jmp wq
.w2:
+ _CET_ENDBR
movd xmm0, [dstq+dsq*0]
pinsrw xmm0, [dstq+dsq*1], 1
movd xmm2, [maskq+hq*2]
@@ -4322,6 +4456,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w2
RET
.w4:
+ _CET_ENDBR
mova xmm3, [blend_shuf]
.w4_loop:
movd xmm0, [dstq+dsq*0]
@@ -4341,6 +4476,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w4_loop
RET
.w8:
+ _CET_ENDBR
vbroadcasti128 ymm4, [blend_shuf]
shufpd ymm4, ymm4, 0x03
.w8_loop:
@@ -4365,6 +4501,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
vzeroupper
RET
.w16:
+ _CET_ENDBR
vbroadcasti32x4 ym4, [blend_shuf]
shufpd ym4, ym4, 0x0c
.w16_loop:
@@ -4388,6 +4525,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w16_loop
RET
.w32:
+ _CET_ENDBR
vbroadcasti32x4 m4, [blend_shuf]
shufpd m4, m4, 0xf0
.w32_loop:
@@ -4411,6 +4549,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w32_loop
RET
.w64:
+ _CET_ENDBR
vpbroadcastw m3, [maskq+hq*2]
mova m1, [dstq]
mova m2, [tmpq]
@@ -4428,6 +4567,7 @@ cglobal blend_h_8bpc, 3, 7, 6, dst, ds, tmp, w, h, mas
jl .w64
RET
.w128:
+ _CET_ENDBR
vpbroadcastw m6, [maskq+hq*2]
mova m2, [dstq+64*0]
mova m1, [tmpq+64*0]