ports/multimedia/dav1d/patches/patch-src_x86_refmvs_asm

241 lines
5.8 KiB
Text

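Insert _CET_ENDBR after the .write* labels of save_tmvs and the .w* labels
of splat_mv in every variant touched below. The .w* labels follow an
indirect "jmp bw4q"; _CET_ENDBR is defined outside this patch and is
presumably an endbr64 landing pad for IBT (TODO confirm).

Index: src/x86/refmvs.asm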
--- src/x86/refmvs.asm.orig
+++ src/x86/refmvs.asm
@@ -224,18 +224,21 @@ cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign,
jg .loop_y
RET
.write1:
+ _CET_ENDBR
movd [rpq+xq+0], m0
psrlq m0, 8
movd [rpq+xq+1], m0
add xq, 5*1
ret
.write2:
+ _CET_ENDBR
movq [rpq+xq+0], m0
psrlq m0, 8
movd [rpq+xq+6], m0
add xq, 5*2
ret
.write4:
+ _CET_ENDBR
pshufb m0, m9
movu [rpq+xq+ 0], m0
psrlq m0, 8
@@ -243,6 +246,7 @@ cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign,
add xq, 5*4
ret
.write8:
+ _CET_ENDBR
pshufb m2, m0, m9
movu [rpq+xq+ 0], m2
pshufb m0, m10
@@ -252,6 +256,7 @@ cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign,
add xq, 5*8
ret
.write16:
+ _CET_ENDBR
pshufb m2, m0, m9
movu [rpq+xq+ 0], m2
pshufb m0, m10
@@ -285,6 +290,7 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
lea aq, [aq+bx4q*4]
jmp bw4q
.w32:
+ _CET_ENDBR
mova [aq-16*16], m0
mova [aq-16*15], m1
mova [aq-16*14], m2
@@ -298,6 +304,7 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
mova [aq-16* 6], m1
mova [aq-16* 5], m2
.w16:
+ _CET_ENDBR
mova [aq-16* 4], m0
mova [aq-16* 3], m1
mova [aq-16* 2], m2
@@ -305,10 +312,12 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
mova [aq+16* 0], m1
mova [aq+16* 1], m2
.w8:
+ _CET_ENDBR
mova [aq+16* 2], m0
mova [aq+16* 3], m1
mova [aq+16* 4], m2
.w4:
+ _CET_ENDBR
mova [aq+16* 5], m0
mova [aq+16* 6], m1
mova [aq+16* 7], m2
@@ -316,12 +325,14 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
jg .loop
RET
.w2:
+ _CET_ENDBR
movu [aq+104], m0
movq [aq+120], m1
dec bh4d
jg .loop
RET
.w1:
+ _CET_ENDBR
movq [aq+116], m0
movd [aq+124], m2
dec bh4d
@@ -401,17 +412,20 @@ cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign
jg .loop_y
RET
.write1:
+ _CET_ENDBR
movd [rpq+xq+ 0], xm0
pextrb [rpq+xq+ 4], xm0, 4
add xq, 5*1
ret
.write2:
+ _CET_ENDBR
movq [rpq+xq+0], xm0
psrlq xm1, xm0, 8
movd [rpq+xq+6], xm1
add xq, 5*2
ret
.write4:
+ _CET_ENDBR
pshufb xm1, xm0, xm8
movu [rpq+xq+ 0], xm1
psrlq xm1, 8
@@ -419,6 +433,7 @@ cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign
add xq, 5*4
ret
.write8:
+ _CET_ENDBR
vinserti128 m1, m0, xm0, 1
pshufb m1, m8
movu [rpq+xq+ 0], m1
@@ -427,6 +442,7 @@ cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign
add xq, 5*8
ret
.write16:
+ _CET_ENDBR
vinserti128 m1, m0, xm0, 1
pshufb m2, m1, m8
movu [rpq+xq+ 0], m2
@@ -455,6 +471,7 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
lea aq, [aq+bx4q*4]
jmp bw4q
.w32:
+ _CET_ENDBR
mova [aq-32*8], m0
mova [aq-32*7], m1
mova [aq-32*6], m2
@@ -462,10 +479,12 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
mova [aq-32*4], m1
mova [aq-32*3], m2
.w16:
+ _CET_ENDBR
mova [aq-32*2], m0
mova [aq-32*1], m1
mova [aq+32*0], m2
.w8:
+ _CET_ENDBR
mova [aq+32*1], m0
mova [aq+32*2], m1
mova [aq+32*3], m2
@@ -473,18 +492,21 @@ cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
jg .loop
RET
.w4:
+ _CET_ENDBR
movu [aq+ 80], m0
mova [aq+112], xm1
dec bh4d
jg .loop
RET
.w2:
+ _CET_ENDBR
movu [aq+104], xm0
movq [aq+120], xm2
dec bh4d
jg .loop
RET
.w1:
+ _CET_ENDBR
movq [aq+116], xm0
movd [aq+124], xm1
dec bh4d
@@ -584,25 +606,30 @@ cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign
jg .loop_y
RET
.write1:
+ _CET_ENDBR
vmovdqu8 [rpq+xq]{k2}, xm0
add xq, 5*1
ret
.write2:
+ _CET_ENDBR
pshufb xm0, xm8
vmovdqu16 [rpq+xq]{k2}, xm0
add xq, 5*2
ret
.write4:
+ _CET_ENDBR
vpermb ym0, ym8, ym0
vmovdqu32 [rpq+xq]{k2}, ym0
add xq, 5*4
ret
.write8:
+ _CET_ENDBR
vpermb m0, m8, m0
vmovdqu64 [rpq+xq]{k2}, m0
add xq, 5*8
ret
.write16:
+ _CET_ENDBR
vpermb m1, m8, m0
movu [rpq+xq+ 0], m1
pshufb xm0, xm9
@@ -626,24 +653,28 @@ cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
kmovb k1, r1d
jmp bw4q
.w1:
+ _CET_ENDBR
mov r1, [rrq+r6*8]
vmovdqu16 [r1+bx4q*4]{k1}, xm0
inc r6
jl .w1
RET
.w2:
+ _CET_ENDBR
mov r1, [rrq+r6*8]
vmovdqu32 [r1+bx4q*4]{k1}, ym0
inc r6
jl .w2
RET
.w4:
+ _CET_ENDBR
mov r1, [rrq+r6*8]
vmovdqu64 [r1+bx4q*4]{k1}, m0
inc r6
jl .w4
RET
.w8:
+ _CET_ENDBR
pshufd ym1, ym0, q1021
.w8_loop:
mov r1, [rrq+r6*8+0]
@@ -656,6 +687,7 @@ cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
jl .w8_loop
RET
.w16:
+ _CET_ENDBR
pshufd m1, m0, q1021
pshufd m2, m0, q2102
.w16_loop:
@@ -671,6 +703,7 @@ cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
jl .w16_loop
RET
.w32:
+ _CET_ENDBR
pshufd m1, m0, q1021
pshufd m2, m0, q2102
.w32_loop: