Insert _CET_ENDBR directly after each .pass2 label in the 8bpc
inverse-transform functions of src/x86/itx_avx2.asm.
Index: src/x86/itx_avx2.asm
|
|
--- src/x86/itx_avx2.asm.orig
|
|
+++ src/x86/itx_avx2.asm
|
|
@@ -440,6 +440,7 @@ cglobal idct_4x4_internal_8bpc, 0, 5, 6, dst, stride,
|
|
pshufb m1, m3, m2
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
IDCT4_1D_PACKED
|
|
pxor m2, m2
|
|
mova [cq+16*0], m2
|
|
@@ -461,6 +462,7 @@ cglobal iadst_4x4_internal_8bpc, 0, 5, 6, dst, stride,
|
|
punpcklwd m0, m3
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
.end:
|
|
pxor m2, m2
|
|
@@ -488,6 +490,7 @@ cglobal iflipadst_4x4_internal_8bpc, 0, 5, 6, dst, str
|
|
punpckhwd m1, m2
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call m(iadst_4x4_internal_8bpc).main
|
|
.end:
|
|
pxor m2, m2
|
|
@@ -515,6 +518,7 @@ cglobal iidentity_4x4_internal_8bpc, 0, 5, 6, dst, str
|
|
punpcklwd m0, m2
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m3, [o(pw_1697x8)]
|
|
pmulhrsw m2, m3, m0
|
|
pmulhrsw m3, m1
|
|
@@ -692,6 +696,7 @@ cglobal idct_4x8_internal_8bpc, 0, 5, 7, dst, stride,
|
|
pshufb m1, m3, m2
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vextracti128 xm2, m0, 1
|
|
vextracti128 xm3, m1, 1
|
|
call .main
|
|
@@ -723,6 +728,7 @@ cglobal iadst_4x8_internal_8bpc, 0, 5, 7, dst, stride,
|
|
punpcklwd m0, m3
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vextracti128 xm2, m0, 1
|
|
vextracti128 xm3, m1, 1
|
|
pshufd xm4, xm0, q1032
|
|
@@ -774,6 +780,7 @@ cglobal iflipadst_4x8_internal_8bpc, 0, 5, 7, dst, str
|
|
punpckhwd m1, m3
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vextracti128 xm2, m0, 1
|
|
vextracti128 xm3, m1, 1
|
|
pshufd xm4, xm0, q1032
|
|
@@ -810,6 +817,7 @@ cglobal iidentity_4x8_internal_8bpc, 0, 5, 7, dst, str
|
|
paddsw m1, m4
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m4, [o(pw_4096)]
|
|
jmp m(iadst_4x8_internal_8bpc).end2
|
|
|
|
@@ -924,6 +932,7 @@ cglobal idct_4x16_internal_8bpc, 0, 5, 11, dst, stride
|
|
punpckhdq m3, m4
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vextracti128 xm4, m0, 1
|
|
vextracti128 xm5, m1, 1
|
|
vextracti128 xm6, m2, 1
|
|
@@ -965,6 +974,7 @@ cglobal iadst_4x16_internal_8bpc, 0, 5, 11, dst, strid
|
|
punpckhdq m3, m4
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
vpbroadcastd m5, [o(pw_2896x8)]
|
|
paddsw m1, m2, m4
|
|
@@ -1094,6 +1104,7 @@ cglobal iflipadst_4x16_internal_8bpc, 0, 5, 11, dst, s
|
|
punpckldq m0, m4
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call m(iadst_4x16_internal_8bpc).main
|
|
vpbroadcastd m5, [o(pw_2896x8)]
|
|
paddsw m1, m2, m4
|
|
@@ -1151,6 +1162,7 @@ cglobal iidentity_4x16_internal_8bpc, 0, 5, 11, dst, s
|
|
punpckhdq m3, m4
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m8, [o(pw_1697x16)]
|
|
vpbroadcastd m5, [o(pw_2048)]
|
|
pmulhrsw m4, m8, m0
|
|
@@ -1221,6 +1233,7 @@ cglobal idct_8x4_internal_8bpc, 0, 5, 7, dst, stride,
|
|
pshufb m1, m4
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
IDCT4_1D_PACKED
|
|
vpermq m0, m0, q3120
|
|
vpermq m1, m1, q2031
|
|
@@ -1250,6 +1263,7 @@ cglobal iadst_8x4_internal_8bpc, 0, 5, 7, dst, stride,
|
|
punpcklwd m0, m3
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
.end:
|
|
vpermq m0, m0, q3120
|
|
@@ -1295,6 +1309,7 @@ cglobal iflipadst_8x4_internal_8bpc, 0, 5, 7, dst, str
|
|
punpcklwd m0, m3
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call m(iadst_8x4_internal_8bpc).main
|
|
mova m2, m1
|
|
vpermq m1, m0, q2031
|
|
@@ -1322,6 +1337,7 @@ cglobal iidentity_8x4_internal_8bpc, 0, 5, 7, dst, str
|
|
paddsw m1, m1
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m3, [o(pw_1697x8)]
|
|
pmulhrsw m2, m3, m0
|
|
pmulhrsw m3, m1
|
|
@@ -1379,6 +1395,7 @@ cglobal idct_8x8_internal_8bpc, 0, 5, 7, dst, stride,
|
|
vperm2i128 m3, m5, m3, 0x31
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
vpbroadcastd m4, [o(pw_2048)]
|
|
vpermq m0, m0, q3120
|
|
@@ -1423,6 +1440,7 @@ cglobal iadst_8x8_internal_8bpc, 0, 5, 7, dst, stride,
|
|
vinserti128 m1, m4, xm1, 1
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
pshufd m4, m0, q1032
|
|
pshufd m5, m1, q1032
|
|
call .main_pass2
|
|
@@ -1490,6 +1508,7 @@ cglobal iflipadst_8x8_internal_8bpc, 0, 5, 7, dst, str
|
|
vperm2i128 m2, m4, m2, 0x31
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
pshufd m4, m0, q1032
|
|
pshufd m5, m1, q1032
|
|
call m(iadst_8x8_internal_8bpc).main_pass2
|
|
@@ -1528,6 +1547,7 @@ cglobal iidentity_8x8_internal_8bpc, 0, 5, 7, dst, str
|
|
punpckhdq m3, m4
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m4, [o(pw_4096)]
|
|
jmp m(iadst_8x8_internal_8bpc).end
|
|
|
|
@@ -1595,6 +1615,7 @@ cglobal idct_8x16_internal_8bpc, 0, 5, 13, dst, stride
|
|
punpckhdq m7, m8
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
REPX {vpermq x, x, q3120}, m0, m2, m4, m6
|
|
REPX {vpermq x, x, q2031}, m1, m3, m5, m7
|
|
@@ -1634,6 +1655,7 @@ cglobal iadst_8x16_internal_8bpc, 0, 5, 13, dst, strid
|
|
jmp m(idct_8x16_internal_8bpc).pass1_end
|
|
ALIGN function_align
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
call .main_pass2_end
|
|
vpbroadcastd m9, [o(pw_2048)]
|
|
@@ -1786,6 +1808,7 @@ cglobal iflipadst_8x16_internal_8bpc, 0, 5, 13, dst, s
|
|
punpckhwd m3, m1
|
|
jmp m(idct_8x16_internal_8bpc).pass1_end2
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call m(iadst_8x16_internal_8bpc).main
|
|
call m(iadst_8x16_internal_8bpc).main_pass2_end
|
|
vpbroadcastd m8, [o(pw_2048)]
|
|
@@ -1862,6 +1885,7 @@ cglobal iidentity_8x16_internal_8bpc, 0, 5, 13, dst, s
|
|
punpckhdq m7, m8
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m8, [o(pw_1697x16)]
|
|
REPX {vpermq x, x, q3120}, m0, m1, m2, m3, m4, m5, m6, m7
|
|
REPX {IDTX16 x, 9, 8}, 0, 1, 2, 3, 4, 5, 6, 7
|
|
@@ -1945,6 +1969,7 @@ cglobal idct_16x4_internal_8bpc, 0, 5, 11, dst, stride
|
|
mova m1, m6
|
|
jmp m(iadst_16x4_internal_8bpc).pass1_end
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
jmp m(iadst_16x4_internal_8bpc).end
|
|
ALIGN function_align
|
|
@@ -1990,6 +2015,7 @@ cglobal iadst_16x4_internal_8bpc, 0, 5, 11, dst, strid
|
|
punpckhdq m3, m4
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
.end:
|
|
vpbroadcastd m4, [o(pw_2048)]
|
|
@@ -2082,6 +2108,7 @@ cglobal iflipadst_16x4_internal_8bpc, 0, 5, 11, dst, s
|
|
jmp m(iadst_16x4_internal_8bpc).pass1_end
|
|
ALIGN function_align
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call m(iadst_16x4_internal_8bpc).main
|
|
vpbroadcastd m4, [o(pw_2048)]
|
|
REPX {pmulhrsw x, m4}, m3, m2, m1, m0
|
|
@@ -2134,6 +2161,7 @@ cglobal iidentity_16x4_internal_8bpc, 0, 5, 11, dst, s
|
|
punpckhqdq m3, m4
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m7, [o(pw_1697x8)]
|
|
pmulhrsw m4, m7, m0
|
|
pmulhrsw m5, m7, m1
|
|
@@ -2218,6 +2246,7 @@ cglobal idct_16x8_internal_8bpc, 0, 5, 13, dst, stride
|
|
vinserti128 m3, xm9, 1
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
vpbroadcastd m8, [o(pw_2048)]
|
|
.end:
|
|
@@ -2264,6 +2293,7 @@ cglobal iadst_16x8_internal_8bpc, 0, 5, 13, dst, strid
|
|
jmp m(idct_16x8_internal_8bpc).pass1_end
|
|
ALIGN function_align
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
call .main_pass2_end
|
|
pxor m8, m8
|
|
@@ -2381,6 +2411,7 @@ cglobal iflipadst_16x8_internal_8bpc, 0, 5, 13, dst, s
|
|
vperm2i128 m7, m8, 0x31
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call m(iadst_16x8_internal_8bpc).main
|
|
call m(iadst_16x8_internal_8bpc).main_pass2_end
|
|
pxor m8, m8
|
|
@@ -2452,6 +2483,7 @@ cglobal iidentity_16x8_internal_8bpc, 0, 5, 13, dst, s
|
|
punpckhqdq m7, m8
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m8, [o(pw_4096)]
|
|
jmp m(idct_16x8_internal_8bpc).end
|
|
|
|
@@ -2576,6 +2608,7 @@ cglobal idct_16x16_internal_8bpc, 0, 5, 16, 32*3, dst,
|
|
punpcklqdq m6, m15
|
|
jmp tx2q
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
.end:
|
|
vpbroadcastd m1, [o(pw_2048)]
|
|
@@ -2656,6 +2689,7 @@ cglobal iadst_16x16_internal_8bpc, 0, 5, 16, 32*3, dst
|
|
jmp m(idct_16x16_internal_8bpc).pass1_end2
|
|
ALIGN function_align
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call .main
|
|
call .main_pass2_end
|
|
REPX {pmulhrsw x, m1}, m0, m2, m4, m6, m8, m10, m12, m14
|
|
@@ -2853,6 +2887,7 @@ cglobal iflipadst_16x16_internal_8bpc, 0, 5, 16, 32*3,
|
|
vperm2i128 m8, m8, [rsp+32*2], 0x31
|
|
jmp m(idct_16x16_internal_8bpc).pass1_end3
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
call m(iadst_16x16_internal_8bpc).main
|
|
call m(iadst_16x16_internal_8bpc).main_pass2_end
|
|
pmulhrsw m0, m1
|
|
@@ -2938,6 +2973,7 @@ cglobal iidentity_16x16_internal_8bpc, 0, 5, 16, 32*3,
|
|
jmp m(idct_16x16_internal_8bpc).pass1_end3
|
|
ALIGN function_align
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m15, [o(pw_1697x16)]
|
|
mova [rsp+32*1], m0
|
|
REPX {IDTX16 x, 0, 15}, 1, 2, 3, 4, 5, 6, 7, \
|
|
@@ -3125,6 +3161,7 @@ cglobal inv_txfm_add_dct_dct_8x32_8bpc, 4, 4, 0, dst,
|
|
pmulhrsw m11, m9, m8
|
|
call .main
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m12, [o(pw_2048)]
|
|
REPX {pmulhrsw x, m12}, m0, m1, m2, m3, m4, m5, m6, m7, \
|
|
m8, m9, m10, m11, m13, m14, m15
|
|
@@ -3335,6 +3372,7 @@ cglobal inv_txfm_add_dct_dct_32x8_8bpc, 4, 4, 0, dst,
|
|
REPX {mova [cq+32*x], m8}, 0, 1, 2, 3
|
|
call m(inv_txfm_add_dct_dct_8x32_8bpc).main
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m12, [o(pw_8192)]
|
|
REPX {pmulhrsw x, m12}, m8, m9, m10, m11, m13, m14, m15
|
|
mova [rsp+32*1], m9
|
|
@@ -4017,6 +4055,7 @@ cglobal inv_txfm_add_dct_dct_32x16_8bpc, 4, 4, 0, dst,
|
|
lea r2, [strideq*3]
|
|
mov r3, dstq
|
|
.pass2:
|
|
+ _CET_ENDBR
|
|
vpbroadcastd m7, [o(pw_16384)]
|
|
call m(inv_txfm_add_dct_dct_16x32_8bpc).transpose_2x8x8_round
|
|
call m(idct_16x16_internal_8bpc).main
|