ports/multimedia/dav1d/patches/patch-src_x86_itx16_sse_asm

513 lines
15 KiB
Text

Index: src/x86/itx16_sse.asm
--- src/x86/itx16_sse.asm.orig
+++ src/x86/itx16_sse.asm
@@ -434,6 +434,7 @@ cglobal idct_4x4_internal_16bpc, 0, 0, 0, dst, stride,
IDCT4_1D 0, 1, 2, 3, 4, 6, 7, 5
ret
.pass2:
+ _CET_ENDBR
; m0 = in0 in1
; m1 = in2 in3
; m5 = pd_2048
@@ -494,6 +495,7 @@ cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride
; m5 = pd_2048
jmp tx2q
.pass2:
+ _CET_ENDBR
; m0 = in0 in1
; m1 = in2 in3
%if ARCH_X86_32
@@ -528,6 +530,7 @@ cglobal iadst_4x4_internal_16bpc, 0, 0, 0, dst, stride
RET
ALIGN function_align
.main:
+ _CET_ENDBR
mova m1, [cq+16*2]
mova m3, [cq+16*3]
mova m5, [cq+16*0]
@@ -581,6 +584,7 @@ cglobal iflipadst_4x4_internal_16bpc, 0, 0, 0, dst, st
; m5 = pd_2048
jmp tx2q
.pass2:
+ _CET_ENDBR
; m0 = in0 in1
; m1 = in2 in3
%if ARCH_X86_32
@@ -639,6 +643,7 @@ cglobal iidentity_4x4_internal_16bpc, 0, 0, 0, dst, st
; m5 = pd_2048
jmp tx2q
.pass2:
+ _CET_ENDBR
; m0 = in0 in1
; m1 = in2 in3
; m5 = pd_2048
@@ -735,6 +740,7 @@ cglobal idct_4x8_internal_16bpc, 0, 0, 0, dst, stride,
; m0-3 = packed & transposed output
jmp tx2q
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -828,6 +834,7 @@ cglobal iadst_4x8_internal_16bpc, 0, 0, 0, dst, stride
.end_pass1:
ret
.pass2:
+ _CET_ENDBR
shufps m0, m0, q1032
shufps m1, m1, q1032
%if ARCH_X86_32
@@ -857,6 +864,7 @@ cglobal iflipadst_4x8_internal_16bpc, 0, 0, 0, dst, st
; m0-3 = packed & transposed output
jmp tx2q
.pass2:
+ _CET_ENDBR
shufps m0, m0, q1032
shufps m1, m1, q1032
%if ARCH_X86_32
@@ -925,6 +933,7 @@ cglobal iidentity_4x8_internal_16bpc, 0, 0, 0, dst, st
; m0-3 = packed & transposed output
jmp tx2q
.pass2:
+ _CET_ENDBR
mova m4, [o(pw_4096)]
jmp m(idct_4x8_internal_16bpc).end
@@ -993,6 +1002,7 @@ cglobal idct_4x16_internal_16bpc, 0, 0, 0, dst, stride
; m0-7 = packed & transposed output
jmp tx2q
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -1092,6 +1102,7 @@ cglobal iadst_4x16_internal_16bpc, 0, 0, 0, dst, strid
sub r5d, 16
jmp .loop_pass1
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -1165,6 +1176,7 @@ cglobal iflipadst_4x16_internal_16bpc, 0, 0, 0, dst, s
sub r5d, 16
jmp .loop_pass1
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -1238,6 +1250,7 @@ cglobal iidentity_4x16_internal_16bpc, 0, 0, 0, dst, s
sub r5d, 16
jmp .loop_pass1
.pass2:
+ _CET_ENDBR
mova [cq+16*4], m0
mova [cq+16*5], m1
mova [cq+16*6], m2
@@ -1361,6 +1374,7 @@ cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride,
punpcklwd m0, m6
ret
.main:
+ _CET_ENDBR
call .main_pass1
call .round
packssdw m0, m1
@@ -1526,6 +1540,7 @@ cglobal idct_8x4_internal_16bpc, 0, 0, 0, dst, stride,
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -1563,6 +1578,7 @@ cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride
lea r5, [o(.main)]
jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
+ _CET_ENDBR
call .main_pass1
call .round
packssdw m0, m1
@@ -1700,6 +1716,7 @@ cglobal iadst_8x4_internal_16bpc, 0, 0, 0, dst, stride
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -1715,6 +1732,7 @@ cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, st
lea r5, [o(.main)]
jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
+ _CET_ENDBR
call m(iadst_8x4_internal_16bpc).main_pass1
call m(iadst_8x4_internal_16bpc).round
packssdw m7, m6
@@ -1727,6 +1745,7 @@ cglobal iflipadst_8x4_internal_16bpc, 0, 0, 0, dst, st
mova m6, m1
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -1745,6 +1764,7 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, st
lea r5, [o(.main)]
jmp m(idct_8x4_internal_16bpc).pass1_entry
.main:
+ _CET_ENDBR
REPX {paddd x, x}, m0, m1, m2, m3, m4, m5, m6, m7
packssdw m0, m1
packssdw m2, m3
@@ -1752,6 +1772,7 @@ cglobal iidentity_8x4_internal_16bpc, 0, 0, 0, dst, st
packssdw m6, m7
ret
.pass2:
+ _CET_ENDBR
mova m7, [o(pw_1697x8)]
pmulhrsw m4, m7, m0
pmulhrsw m5, m7, m1
@@ -1870,12 +1891,14 @@ cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride,
%endif
jmp tx2q
.pass1_main:
+ _CET_ENDBR
call m(idct_8x4_internal_16bpc).main_pass1
pcmpeqd m1, m1
REPX {psubd x, m1}, m0, m6, m5, m3
call m(idct_8x4_internal_16bpc).round
REPX {psrad x, 1 }, m0, m1, m2, m3, m4, m5, m6, m7
.pack_and_transpose:
+ _CET_ENDBR
packssdw m2, m3
packssdw m6, m7
packssdw m0, m1
@@ -1883,6 +1906,7 @@ cglobal idct_8x8_internal_16bpc, 0, 0, 0, dst, stride,
jmp m(idct_8x4_internal_16bpc).transpose4x8packed
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -1981,6 +2005,7 @@ cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride
lea t0, [o(.pass1_main)]
jmp m(idct_8x8_internal_16bpc).pass1_full
.pass1_main:
+ _CET_ENDBR
call m(iadst_8x4_internal_16bpc).main_pass1
call .round
jmp m(idct_8x8_internal_16bpc).pack_and_transpose
@@ -2014,6 +2039,7 @@ cglobal iadst_8x8_internal_16bpc, 0, 0, 0, dst, stride
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -2065,6 +2091,7 @@ cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, st
lea t0, [o(.pass1_main)]
jmp m(idct_8x8_internal_16bpc).pass1_full
.pass1_main:
+ _CET_ENDBR
call m(iadst_8x4_internal_16bpc).main_pass1
call m(iadst_8x8_internal_16bpc).round
; invert registers
@@ -2079,6 +2106,7 @@ cglobal iflipadst_8x8_internal_16bpc, 0, 0, 0, dst, st
jmp m(idct_8x4_internal_16bpc).transpose4x8packed
.pass2:
+ _CET_ENDBR
lea dstq, [dstq+strideq*8]
sub dstq, strideq
neg strideq
@@ -2110,6 +2138,7 @@ cglobal iidentity_8x8_internal_16bpc, 0, 0, 0, dst, st
jmp m_suffix(idct_8x8_internal_8bpc, _ssse3).pass1_end3
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -2211,6 +2240,7 @@ cglobal idct_8x16_internal_16bpc, 0, 0, 0, dst, stride
jmp tx2q
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -2299,6 +2329,7 @@ cglobal iadst_8x16_internal_16bpc, 0, 0, 0, dst, strid
jmp m(idct_8x16_internal_16bpc).pass1_full
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -2380,6 +2411,7 @@ cglobal iflipadst_8x16_internal_16bpc, 0, 0, 0, dst, s
jmp m(idct_8x16_internal_16bpc).pass1_full
.pass2:
+ _CET_ENDBR
lea r3, [strideq*3]
lea r3, [r3*5]
add dstq, r3
@@ -2402,6 +2434,7 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, s
jmp m(idct_8x16_internal_16bpc).pass1_full
.pass2:
+ _CET_ENDBR
%if ARCH_X86_64
mova m4, [o(pw_2048)]
mova m5, [o(pixel_10bpc_max)]
@@ -2430,6 +2463,7 @@ cglobal iidentity_8x16_internal_16bpc, 0, 0, 0, dst, s
.end:
RET
.main:
+ _CET_ENDBR
; y = pmulhrsw(x, pw_1697x16); x = paddsw(x, x); x = paddsw(x, y)
%if ARCH_X86_32
mova m7, [o(pw_1697x16)]
@@ -2836,6 +2870,7 @@ cglobal idct_16x4_internal_16bpc, 0, 0, 0, dst, stride
ret
.pass2:
+ _CET_ENDBR
lea r4, [o(m_suffix(idct_8x4_internal_8bpc, _ssse3).main)]
.pass2_loop:
lea r3, [strideq*3]
@@ -2890,6 +2925,7 @@ cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, strid
%endif
.main:
+ _CET_ENDBR
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
@@ -3312,6 +3348,7 @@ cglobal iadst_16x4_internal_16bpc, 0, 0, 0, dst, strid
ret
.pass2:
+ _CET_ENDBR
lea r4, [o(m_suffix(iadst_8x4_internal_8bpc, _ssse3).main)]
jmp m(idct_16x4_internal_16bpc).pass2_loop
@@ -3364,6 +3401,7 @@ cglobal iflipadst_16x4_internal_16bpc, 0, 0, 0, dst, s
%endif
.pass2:
+ _CET_ENDBR
lea r3, [strideq*3]
lea dstq, [dstq+r3]
neg strideq
@@ -3438,12 +3476,14 @@ cglobal iidentity_16x4_internal_16bpc, 0, 0, 0, dst, s
%endif
.pass2:
+ _CET_ENDBR
%if ARCH_X86_64
mova m12, [o(pw_1697x8)]
%endif
lea r4, [o(.main)]
jmp m(idct_16x4_internal_16bpc).pass2_loop
.main:
+ _CET_ENDBR
%if ARCH_X86_64
pmulhrsw m4, m0, m12
pmulhrsw m5, m1, m12
@@ -3543,6 +3583,7 @@ cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride
jmp tx2q
.main:
+ _CET_ENDBR
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
@@ -3585,6 +3626,7 @@ cglobal idct_16x8_internal_16bpc, 0, 0, 0, dst, stride
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
mov strideq, [rsp+gprsize+12*16]
%endif
@@ -3638,6 +3680,7 @@ cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, strid
jmp m(idct_16x8_internal_16bpc).loop_main
.main:
+ _CET_ENDBR
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
@@ -3684,6 +3727,7 @@ cglobal iadst_16x8_internal_16bpc, 0, 0, 0, dst, strid
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
mov strideq, [rsp+gprsize+12*16]
%endif
@@ -3737,6 +3781,7 @@ cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, s
lea t0, [o(.main)]
jmp m(idct_16x8_internal_16bpc).loop_main
.main:
+ _CET_ENDBR
call m(iadst_16x8_internal_16bpc).main
%if ARCH_X86_64
pshufd m1, m0, q1032
@@ -3768,6 +3813,7 @@ cglobal iflipadst_16x8_internal_16bpc, 0, 0, 0, dst, s
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
mov strideq, [rsp+gprsize+12*16]
%endif
@@ -3791,6 +3837,7 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, s
lea t0, [o(.main)]
jmp m(idct_16x8_internal_16bpc).loop_main
.main:
+ _CET_ENDBR
%if ARCH_X86_64
mova m15, [o(pd_2896)]
pmulld m0, m15, [cq+ 0*32+r5]
@@ -3888,6 +3935,7 @@ cglobal iidentity_16x8_internal_16bpc, 0, 0, 0, dst, s
%endif
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
mov strideq, [rsp+gprsize+12*16]
%endif
@@ -4013,6 +4061,7 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, strid
%endif
jmp tx2q
.main:
+ _CET_ENDBR
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
@@ -4147,6 +4196,7 @@ cglobal idct_16x16_internal_16bpc, 0, 0, 0, dst, strid
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_64
mova m8, [o(pw_2048)]
pxor m9, m9
@@ -4246,6 +4296,7 @@ cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stri
jmp m(idct_16x16_internal_16bpc).pass1_full
.main:
+ _CET_ENDBR
%if ARCH_X86_64
mova m11, [o(pd_2048)]
mova m12, [o(clip_18b_min)]
@@ -4352,6 +4403,7 @@ cglobal iadst_16x16_internal_16bpc, 0, 0, 0, dst, stri
%endif
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_64
mova m8, [o(pw_2048)]
mova m11, [o(pw_m2048)]
@@ -4452,6 +4504,7 @@ cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst,
jmp m(idct_16x16_internal_16bpc).pass1_full
.main:
+ _CET_ENDBR
call m(iadst_16x16_internal_16bpc).main
%if ARCH_X86_64
mova m1, m0
@@ -4483,6 +4536,7 @@ cglobal iflipadst_16x16_internal_16bpc, 0, 0, 0, dst,
ret
.pass2:
+ _CET_ENDBR
lea r3, [strideq*3]
lea r3, [r3*5]
add dstq, r3
@@ -4503,6 +4557,7 @@ cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst,
jmp m(idct_16x16_internal_16bpc).pass1_full
.main:
+ _CET_ENDBR
%if ARCH_X86_64
mova m15, [o(pd_11586)]
pmulld m0, m15, [cq+ 0*64+r5]
@@ -4581,6 +4636,7 @@ cglobal iidentity_16x16_internal_16bpc, 0, 0, 0, dst,
ret
.pass2:
+ _CET_ENDBR
%if ARCH_X86_64
mova m4, [o(pw_2048)]
mova m5, [o(pixel_10bpc_max)]
@@ -4659,6 +4715,7 @@ ALIGN function_align
.main_zero:
REPX {mova [cq+128*x], m6}, 0, 1, 2, 3, 4, 5, 6, 7
.main:
+ _CET_ENDBR
punpckhwd m4, m0, m1
punpcklwd m0, m1
punpckhwd m1, m2, m3
@@ -4770,6 +4827,7 @@ cglobal inv_txfm_add_identity_identity_16x32_16bpc, 4,
RET
ALIGN function_align
.main:
+ _CET_ENDBR
mova m0, [cq+128*0]
packssdw m0, [cq+128*1]
mova m1, [cq+128*2]
@@ -4858,6 +4916,7 @@ cglobal inv_txfm_add_identity_identity_32x16_16bpc, 4,
RET
ALIGN function_align
.main:
+ _CET_ENDBR
mova m0, [cq+64*0]
packssdw m0, [cq+64*1]
mova m1, [cq+64*2]
@@ -4970,6 +5029,7 @@ ALIGN function_align
sub cq, 128*8
sub dstq, 16
.main:
+ _CET_ENDBR
mova m0, [cq+128*0]
packssdw m0, [cq+128*1]
mova m1, [cq+128*2]
@@ -5093,6 +5153,7 @@ cglobal inv_txfm_add_dct_dct_8x32_16bpc, 4, 7, 15, 0-3
RET
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -6256,6 +6317,7 @@ cglobal inv_txfm_add_dct_dct_32x16_16bpc, 4, 7, 16, 0-
RET
.pass2:
+ _CET_ENDBR
%if ARCH_X86_64
mova m8, [o(pw_2048)]
pxor m9, m9
@@ -6770,6 +6832,7 @@ cglobal inv_txfm_add_dct_dct_16x64_16bpc, 4, 7, 16, \
RET
.pass2:
+ _CET_ENDBR
%if ARCH_X86_32
lea r5, [o(itx8_start)]
%endif
@@ -7220,6 +7283,7 @@ cglobal inv_txfm_add_dct_dct_64x16_16bpc, 4, 7, 16, 0-
RET
.pass2:
+ _CET_ENDBR
%if ARCH_X86_64
mova m8, [o(pw_2048)]
pxor m9, m9