ports/multimedia/dav1d/patches/patch-src_x86_itx16_avx512_asm

156 lines
5.5 KiB
Text
Raw Normal View History

2023-08-16 22:26:55 +00:00
Index: src/x86/itx16_avx512.asm
--- src/x86/itx16_avx512.asm.orig
+++ src/x86/itx16_avx512.asm
@@ -274,6 +274,7 @@ cglobal inv_txfm_add_%1_%2_%4_10bpc, 4, 7, 0, dst, str
times ((%%end - %%p1) >> 31) & 1 jmp %%p1
ALIGN function_align
%%end:
+ _CET_ENDBR
%endif
%endmacro
@@ -330,6 +331,7 @@ cglobal idct_8x8_internal_10bpc, 0, 7, 16, dst, stride
punpckldq m0, m2
jmp tx2q
.pass2:
+ _CET_ENDBR
lea r5, [o_base_8bpc]
vextracti32x8 ym2, m0, 1
vextracti32x8 ym3, m1, 1
@@ -461,6 +463,7 @@ cglobal iadst_8x8_internal_10bpc, 0, 7, 16, dst, strid
vpermt2q m0, m2, m3
jmp tx2q
.pass2:
+ _CET_ENDBR
call .main_pass2
movu m10, [permC+2]
vbroadcasti32x8 m12, [pw_2048_m2048+16]
@@ -531,6 +534,7 @@ cglobal iflipadst_8x8_internal_10bpc, 0, 7, 16, dst, s
paddd m4, m11
jmp m(iadst_8x8_internal_10bpc).pass1_end
.pass2:
+ _CET_ENDBR
call m(iadst_8x8_internal_10bpc).main_pass2
movu m10, [permC+1]
vbroadcasti32x8 m12, [pw_m2048_2048+16]
@@ -561,6 +565,7 @@ cglobal iidentity_8x8_internal_10bpc, 0, 7, 16, dst, s
punpckhdq m1, m2 ; 2 3 6 7
jmp tx2q
.pass2:
+ _CET_ENDBR
movu m3, [o(permC+2)]
vpbroadcastd m12, [o(pw_4096)]
psrlq m2, m3, 32
@@ -600,6 +605,7 @@ cglobal idct_8x16_internal_10bpc, 0, 7, 16, dst, strid
packssdw m3, m7
jmp tx2q
.pass2:
+ _CET_ENDBR
mova m8, [o(idct8x16p)]
REPX {vpermb x, m8, x}, m0, m1, m2, m3
punpckhdq m5, m0, m1
@@ -829,6 +835,7 @@ cglobal iadst_8x16_internal_10bpc, 0, 7, 16, dst, stri
vextracti32x8 ym3, m3, 1
jmp tx2q
.pass2:
+ _CET_ENDBR
call .pass2_main
movu m4, [permB+2]
vbroadcasti32x8 m12, [pw_2048_m2048+16]
@@ -964,6 +971,7 @@ cglobal iflipadst_8x16_internal_10bpc, 0, 7, 16, dst,
paddd m4, m11
jmp m(iadst_8x16_internal_10bpc).fast_end
.pass2:
+ _CET_ENDBR
call m(iadst_8x16_internal_10bpc).pass2_main
movu m7, [permB+2]
vbroadcasti32x8 m12, [pw_m2048_2048+16]
@@ -985,6 +993,7 @@ cglobal iidentity_8x16_internal_10bpc, 0, 7, 16, dst,
call m(idct_8x16_internal_10bpc).load2
jmp m(idct_8x16_internal_10bpc).pass1_end
.pass2:
+ _CET_ENDBR
vpbroadcastd m8, [o(pw_1697x16)]
pmulhrsw m4, m8, m0
pmulhrsw m5, m8, m1
@@ -1131,6 +1140,7 @@ cglobal idct_16x8_internal_10bpc, 0, 7, 16, dst, strid
call .transpose_16x8
jmp tx2q
.pass2:
+ _CET_ENDBR
lea r5, [o_base_8bpc]
call m(idct_16x8_internal_8bpc).main
movshdup m4, [permC]
@@ -1270,6 +1280,7 @@ cglobal iadst_16x8_internal_10bpc, 0, 7, 16, dst, stri
REPX {psrad x, 1}, m0, m4, m1, m5, m2, m6, m3, m7
jmp m(idct_16x8_internal_10bpc).pass1_end2
.pass2:
+ _CET_ENDBR
call .main_pass2
vpermq m8, m11, m0
vpermq m9, m11, m1
@@ -1436,6 +1447,7 @@ cglobal iflipadst_16x8_internal_10bpc, 0, 7, 16, dst,
psubd m0, m9, m8
jmp m(iadst_16x8_internal_10bpc).pass1_end
.pass2:
+ _CET_ENDBR
call m(iadst_16x8_internal_10bpc).main_pass2
psrlq m11, 8
vpermq m8, m11, m3
@@ -1465,6 +1477,7 @@ cglobal iidentity_16x8_internal_10bpc, 0, 7, 16, dst,
call m(idct_16x8_internal_10bpc).transpose_16x8
jmp tx2q
.pass2:
+ _CET_ENDBR
movshdup m4, [o(permC)]
vpbroadcastd m11, [o(pw_4096)]
mova m5, m4
@@ -1538,6 +1551,7 @@ cglobal idct_16x16_internal_10bpc, 0, 7, 16, dst, stri
jge .zero_loop
jmp tx2q
.pass2:
+ _CET_ENDBR
lea r5, [o_base_8bpc]
call m(idct_16x16_internal_8bpc).main
movshdup m12, [permC]
@@ -1801,6 +1815,7 @@ cglobal iadst_16x16_internal_10bpc, 0, 7, 16, dst, str
REPX {mova [cq+64*x], ym4}, 0, 1, 2, 3, 4, 5, 6, 7
jmp tx2q
.pass2:
+ _CET_ENDBR
lea r5, [o_base_8bpc]
call m(iadst_16x16_internal_8bpc).main_pass2b
movshdup m12, [permC]
@@ -2001,6 +2016,7 @@ cglobal iflipadst_16x16_internal_10bpc, 0, 7, 16, dst,
psubd m0, m9, m8
jmp m(iadst_16x16_internal_10bpc).pass1_fast_end
.pass2:
+ _CET_ENDBR
lea r5, [o_base_8bpc]
call m(iadst_16x16_internal_8bpc).main_pass2b
movshdup m12, [permC]
@@ -2064,6 +2080,7 @@ cglobal iidentity_16x16_internal_10bpc, 0, 7, 16, dst,
REPX {mova x, m7}, m4, m5, m6
jmp m(idct_16x16_internal_10bpc).pass1_end3
.pass2:
+ _CET_ENDBR
movshdup m14, [o(permC)]
vpbroadcastd m15, [o(pw_1697x16)]
lea r6, [strideq*3]
@@ -4434,7 +4451,6 @@ cglobal inv_txfm_add_dct_dct_32x64_10bpc, 4, 7, 0, dst
mova m10, m13
call .pass2_fast2_start
.end:
-
pxor m31, m31
.left_zero_loop:
@@ -5050,6 +5066,7 @@ ALIGN function_align
call .pass2
RET
.pass2:
+ _CET_ENDBR
psrlq m12, [permC], 24 ; 0 2 8 10 1 3 9 11
psrlq m13, m12, 32 ; 4 6 12 14 5 7 13 15
call m(inv_txfm_add_dct_dct_32x16_10bpc).transpose_16x32