ports/multimedia/dav1d/patches/patch-src_x86_ipred16_avx512_asm

204 lines
6.4 KiB
Text
Raw Normal View History

Index: src/x86/ipred16_avx512.asm
--- src/x86/ipred16_avx512.asm.orig
+++ src/x86/ipred16_avx512.asm
@@ -104,6 +104,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
add wq, r6
jmp wq
.w4:
+ _CET_ENDBR
vpbroadcastq m4, [tlq+2] ; top
movsldup m7, [base+ipred_shuf]
lea r6, [strideq*3]
@@ -133,6 +134,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
.w4_end:
RET
.w8:
+ _CET_ENDBR
vbroadcasti32x4 m4, [tlq+2]
movsldup m7, [base+ipred_shuf]
lea r6, [strideq*3]
@@ -152,6 +154,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
jg .w8_loop
RET
.w16:
+ _CET_ENDBR
vbroadcasti32x8 m4, [tlq+2]
movsldup m7, [base+ipred_shuf]
psubw m5, m4, m3
@@ -168,6 +171,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
movu m4, [tlq+2]
psubw m5, m4, m3
pabsw m6, m5
@@ -181,6 +185,7 @@ cglobal ipred_paeth_16bpc, 3, 7, 10, dst, stride, tl,
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
movu m4, [tlq+ 2]
movu m7, [tlq+66]
psubw m5, m4, m3
@@ -212,6 +217,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
vpbroadcastq m5, [tlq+2] ; top
movsldup m4, [ipred_shuf]
psubw m5, m6 ; top - bottom
@@ -239,6 +245,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
.end:
RET
.w8:
+ _CET_ENDBR
vbroadcasti32x4 m5, [tlq+2] ; top
movsldup m4, [ipred_shuf]
psubw m5, m6 ; top - bottom
@@ -256,6 +263,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
jl .w8_loop
RET
.w16:
+ _CET_ENDBR
vbroadcasti32x8 m5, [tlq+2] ; top
movsldup m4, [ipred_shuf]
psubw m5, m6 ; top - bottom
@@ -277,6 +285,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
jl .w16_loop
RET
.w32:
+ _CET_ENDBR
movu m5, [tlq+2]
psubw m5, m6
.w32_loop:
@@ -295,6 +304,7 @@ cglobal ipred_smooth_v_16bpc, 3, 7, 7, dst, stride, tl
jl .w32_loop
RET
.w64:
+ _CET_ENDBR
movu m4, [tlq+ 2]
movu m5, [tlq+66]
psubw m4, m6
@@ -329,6 +339,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
lea wq, [base+ipred_smooth_h_16bpc_avx512icl_table+wq]
jmp wq
.w4:
+ _CET_ENDBR
movsldup m4, [base+ipred_shuf]
vpbroadcastq m5, [base+smooth_weights_1d_16bpc+4*2]
.w4_loop:
@@ -356,6 +367,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
.end:
RET
.w8:
+ _CET_ENDBR
movsldup m4, [base+ipred_shuf]
vbroadcasti32x4 m5, [base+smooth_weights_1d_16bpc+8*2]
.w8_loop:
@@ -373,6 +385,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
jg .w8_loop
RET
.w16:
+ _CET_ENDBR
movsldup m4, [base+ipred_shuf]
vbroadcasti32x8 m5, [base+smooth_weights_1d_16bpc+16*2]
.w16_loop:
@@ -395,6 +408,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
movu m5, [base+smooth_weights_1d_16bpc+32*2]
.w32_loop:
vpbroadcastq m3, [tlq+hq-8]
@@ -415,6 +429,7 @@ cglobal ipred_smooth_h_16bpc, 3, 7, 7, dst, stride, tl
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
movu m4, [base+smooth_weights_1d_16bpc+64*2]
movu m5, [base+smooth_weights_1d_16bpc+64*3]
.w64_loop:
@@ -456,6 +471,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
lea v_weightsq, [base+smooth_weights_2d_16bpc+hq*2]
jmp wq
.w4:
+ _CET_ENDBR
vpbroadcastq m5, [tlq+hq+2]
movshdup m3, [base+ipred_shuf]
movsldup m4, [base+ipred_shuf]
@@ -483,6 +499,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
jg .w4_loop
RET
.w8:
+ _CET_ENDBR
vbroadcasti32x4 ym5, [tlq+hq+2]
movshdup m6, [base+ipred_shuf]
movsldup m7, [base+ipred_shuf]
@@ -517,6 +534,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
jg .w8_loop
RET
.w16:
+ _CET_ENDBR
pmovzxwd m5, [tlq+hq+2]
mova m6, [base+smooth_weights_2d_16bpc+16*4]
vpblendmw m5{k1}, m0, m5 ; top, bottom
@@ -541,6 +559,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
jg .w16_loop
RET
.w32:
+ _CET_ENDBR
pmovzxwd m5, [tlq+hq+ 2]
pmovzxwd m6, [tlq+hq+34]
mova m7, [base+smooth_weights_2d_16bpc+32*4]
@@ -574,6 +593,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl,
jg .w32_loop
RET
.w64:
+ _CET_ENDBR
pmovzxwd m5, [tlq+hq+ 2]
pmovzxwd m6, [tlq+hq+34]
pmovzxwd m7, [tlq+hq+66]
@@ -621,6 +641,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
pmovzxbw ym0, [idxq]
add idxq, 16
vpermw ym0, ym0, ym3
@@ -634,6 +655,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
jg .w4
RET
.w8:
+ _CET_ENDBR
pmovzxbw m0, [idxq]
add idxq, 32
vpermw m0, m0, m3
@@ -646,6 +668,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
jg .w8
RET
.w16:
+ _CET_ENDBR
vpermb m1, m2, [idxq]
add idxq, 64
vpermw m0, m1, m3
@@ -660,6 +683,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
jg .w16
RET
.w32:
+ _CET_ENDBR
vpermb m1, m2, [idxq]
add idxq, 64
vpermw m0, m1, m3
@@ -672,6 +696,7 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx
jg .w32
RET
.w64:
+ _CET_ENDBR
vpermb m1, m2, [idxq]
add idxq, 64
vpermw m0, m1, m3