560 lines
17 KiB
Text
560 lines
17 KiB
Text
|
Index: src/x86/ipred16_sse.asm
|
||
|
--- src/x86/ipred16_sse.asm.orig
|
||
|
+++ src/x86/ipred16_sse.asm
|
||
|
@@ -143,6 +143,7 @@ cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl,
|
||
|
add wq, r5
|
||
|
jmp r6
|
||
|
.h64:
|
||
|
+ _CET_ENDBR
|
||
|
movu m2, [tlq+112]
|
||
|
movu m1, [tlq+ 96]
|
||
|
paddw m0, m2
|
||
|
@@ -152,17 +153,21 @@ cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl,
|
||
|
paddw m0, m2
|
||
|
paddw m0, m1
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+ 48]
|
||
|
movu m2, [tlq+ 32]
|
||
|
paddw m1, m2
|
||
|
paddw m0, m1
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+ 16]
|
||
|
paddw m0, m1
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
movhlps m1, m0
|
||
|
paddw m0, m1
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
punpcklwd m0, m3
|
||
|
paddd m4, m0
|
||
|
punpckhqdq m0, m0
|
||
|
@@ -193,9 +198,11 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp r6
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
movq m0, [tlq-8]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
movq m1, [tlq+2]
|
||
|
paddw m1, m0
|
||
|
punpckhwd m0, m3
|
||
|
@@ -222,6 +229,7 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
.w4_end:
|
||
|
pshuflw m0, m0, q0000
|
||
|
.s4:
|
||
|
+ _CET_ENDBR
|
||
|
movq [dstq+strideq*0], m0
|
||
|
movq [dstq+strideq*1], m0
|
||
|
movq [dstq+strideq*2], m0
|
||
|
@@ -231,9 +239,11 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
jg .s4
|
||
|
RET
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-16]
|
||
|
jmp wq
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+2]
|
||
|
paddw m0, m1
|
||
|
punpcklwd m1, m0, m3
|
||
|
@@ -258,6 +268,7 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
pshuflw m0, m0, q0000
|
||
|
punpcklqdq m0, m0
|
||
|
.s8:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+strideq*0], m0
|
||
|
mova [dstq+strideq*1], m0
|
||
|
mova [dstq+strideq*2], m0
|
||
|
@@ -267,10 +278,12 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
jg .s8
|
||
|
RET
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-32]
|
||
|
paddw m0, [tlq-16]
|
||
|
jmp wq
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+ 2]
|
||
|
movu m2, [tlq+18]
|
||
|
paddw m1, m2
|
||
|
@@ -297,8 +310,10 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
pshuflw m0, m0, q0000
|
||
|
punpcklqdq m0, m0
|
||
|
.s16c:
|
||
|
+ _CET_ENDBR
|
||
|
mova m1, m0
|
||
|
.s16:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+strideq*0+16*0], m0
|
||
|
mova [dstq+strideq*0+16*1], m1
|
||
|
mova [dstq+strideq*1+16*0], m0
|
||
|
@@ -312,12 +327,14 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
jg .s16
|
||
|
RET
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-64]
|
||
|
paddw m0, [tlq-48]
|
||
|
paddw m0, [tlq-32]
|
||
|
paddw m0, [tlq-16]
|
||
|
jmp wq
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+ 2]
|
||
|
movu m2, [tlq+18]
|
||
|
paddw m1, m2
|
||
|
@@ -348,10 +365,12 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
pshuflw m0, m0, q0000
|
||
|
punpcklqdq m0, m0
|
||
|
.s32c:
|
||
|
+ _CET_ENDBR
|
||
|
mova m1, m0
|
||
|
mova m2, m0
|
||
|
mova m3, m0
|
||
|
.s32:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+strideq*0+16*0], m0
|
||
|
mova [dstq+strideq*0+16*1], m1
|
||
|
mova [dstq+strideq*0+16*2], m2
|
||
|
@@ -365,6 +384,7 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
jg .s32
|
||
|
RET
|
||
|
.h64:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-128]
|
||
|
mova m1, [tlq-112]
|
||
|
paddw m0, [tlq- 96]
|
||
|
@@ -376,6 +396,7 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
paddw m0, m1
|
||
|
jmp wq
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+ 2]
|
||
|
movu m2, [tlq+ 18]
|
||
|
paddw m1, m2
|
||
|
@@ -414,6 +435,7 @@ cglobal ipred_dc_16bpc, 4, 7, 6, dst, stride, tl, w, h
|
||
|
pshuflw m0, m0, q0000
|
||
|
punpcklqdq m0, m0
|
||
|
.s64:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+16*0], m0
|
||
|
mova [dstq+16*1], m0
|
||
|
mova [dstq+16*2], m0
|
||
|
@@ -454,6 +476,7 @@ cglobal ipred_v_16bpc, 4, 7, 6, dst, stride, tl, w, h,
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp wq
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
WIN64_SPILL_XMM 8
|
||
|
movu m4, [tlq+ 66]
|
||
|
movu m5, [tlq+ 82]
|
||
|
@@ -485,6 +508,7 @@ cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h,
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
sub tlq, 8
|
||
|
movq m3, [tlq]
|
||
|
pshuflw m0, m3, q3333
|
||
|
@@ -500,6 +524,7 @@ cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h,
|
||
|
jg .w4
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
sub tlq, 8
|
||
|
movq m3, [tlq]
|
||
|
punpcklwd m3, m3
|
||
|
@@ -516,6 +541,7 @@ cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h,
|
||
|
jg .w8
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
sub tlq, 4
|
||
|
movd m1, [tlq]
|
||
|
pshufb m0, m1, m3
|
||
|
@@ -529,6 +555,7 @@ cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h,
|
||
|
jg .w16
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
sub tlq, 4
|
||
|
movd m1, [tlq]
|
||
|
pshufb m0, m1, m3
|
||
|
@@ -546,6 +573,7 @@ cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h,
|
||
|
jg .w32
|
||
|
RET
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
sub tlq, 2
|
||
|
movd m0, [tlq]
|
||
|
pshufb m0, m2
|
||
|
@@ -603,6 +631,7 @@ cglobal ipred_paeth_16bpc, 4, 6, 8, dst, stride, tl, w
|
||
|
jg .w4_loop
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
%if ARCH_X86_32
|
||
|
PUSH r6
|
||
|
%define r7d hm
|
||
|
@@ -682,6 +711,7 @@ cglobal ipred_smooth_v_16bpc, 4, 6, 6, dst, stride, tl
|
||
|
jl .w4_loop
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
%if ARCH_X86_32
|
||
|
PUSH r6
|
||
|
%assign regs_used 7
|
||
|
@@ -752,6 +782,7 @@ cglobal ipred_smooth_h_16bpc, 3, 6, 6, dst, stride, tl
|
||
|
jg .w4_loop
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
lea weightsq, [weightsq+wq*4]
|
||
|
neg wq
|
||
|
%if ARCH_X86_32
|
||
|
@@ -838,6 +869,7 @@ cglobal ipred_smooth_16bpc, 3, 7, 8, dst, stride, tl,
|
||
|
jl .w4_loop
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
%if ARCH_X86_32
|
||
|
lea h_weightsq, [h_weightsq+wq*4]
|
||
|
mov t0, tlq
|
||
|
@@ -947,6 +979,7 @@ cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride,
|
||
|
xor angled, 0x4ff ; d = 90 - angle
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
lea r3d, [angleq+88]
|
||
|
test r3d, 0x480
|
||
|
jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
|
||
|
@@ -1089,6 +1122,7 @@ cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride,
|
||
|
.w4_end:
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
lea r3d, [angleq+88]
|
||
|
and r3d, ~0x7f
|
||
|
or r3d, hd
|
||
|
@@ -1239,6 +1273,7 @@ cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride,
|
||
|
.w8_end:
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
%if ARCH_X86_32
|
||
|
%define strideq r3
|
||
|
%endif
|
||
|
@@ -1337,6 +1372,7 @@ cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride,
|
||
|
.w16_end:
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
lea r3d, [hq+31]
|
||
|
and r3d, 31
|
||
|
or r3d, 32 ; imin(h+31, 63)
|
||
|
@@ -1424,6 +1460,7 @@ cglobal ipred_z1_16bpc, 3, 7, 8, -16*18, dst, stride,
|
||
|
.w32_end:
|
||
|
RET
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
lea r3d, [hq+63]
|
||
|
test angled, 0x400 ; !enable_intra_edge_filter
|
||
|
jnz .w64_main
|
||
|
@@ -1720,6 +1757,7 @@ cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w
|
||
|
movq [rsp+8*2], m7
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
test angled, 0x400
|
||
|
jnz .w4_main
|
||
|
lea r3d, [hq+2]
|
||
|
@@ -2024,6 +2062,7 @@ cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w
|
||
|
.w4_ret:
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
test angled, 0x400
|
||
|
jnz .w4_main
|
||
|
lea r3d, [angleq+126]
|
||
|
@@ -2122,6 +2161,7 @@ cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w
|
||
|
.w8_filter_top_end:
|
||
|
ret
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
test angled, 0x400
|
||
|
jnz .w4_main
|
||
|
lea r3d, [hq+15]
|
||
|
@@ -2175,6 +2215,7 @@ cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w
|
||
|
call mangle(private_prefix %+ _ipred_z3_16bpc_ssse3).filter_edge
|
||
|
jmp .filter_left_end
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+16*2+2]
|
||
|
movu m2, [tlq+16*3+2]
|
||
|
mova [rsp+16*16], m1
|
||
|
@@ -2237,6 +2278,7 @@ cglobal ipred_z2_16bpc, 4, 7, 8, -16*27, dst, _, tl, w
|
||
|
movu [rsp+r2*2+16* 5], m2
|
||
|
jmp .w4_main
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+16*2+2]
|
||
|
movu m2, [tlq+16*3+2]
|
||
|
movu m3, [tlq+16*4+2]
|
||
|
@@ -2315,6 +2357,7 @@ cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride,
|
||
|
movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
|
||
|
jmp hq
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
lea r4d, [angleq+88]
|
||
|
test r4d, 0x480
|
||
|
jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
|
||
|
@@ -2461,6 +2504,7 @@ cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride,
|
||
|
or r3d, 4*2
|
||
|
jmp .end_transpose
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
lea r4d, [angleq+88]
|
||
|
and r4d, ~0x7f
|
||
|
or r4d, wd
|
||
|
@@ -2619,6 +2663,7 @@ cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride,
|
||
|
or r3d, 8*2
|
||
|
jmp .end_transpose
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
lea r4d, [wq+15]
|
||
|
movd m1, r4d
|
||
|
and r4d, 15
|
||
|
@@ -2715,6 +2760,7 @@ cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride,
|
||
|
or r3d, 16*2
|
||
|
jmp .end_transpose
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
lea r4d, [wq+31]
|
||
|
and r4d, 31
|
||
|
or r4d, 32 ; imin(w+31, 63)
|
||
|
@@ -2802,6 +2848,7 @@ cglobal ipred_z3_16bpc, 4, 7, 8, -16*18, dst, stride,
|
||
|
or r3d, 32*2
|
||
|
jmp .end_transpose
|
||
|
.h64:
|
||
|
+ _CET_ENDBR
|
||
|
lea r4d, [wq+63]
|
||
|
test angled, 0x400 ; !enable_intra_edge_filter
|
||
|
jnz .h64_main
|
||
|
@@ -3218,17 +3265,21 @@ cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl
|
||
|
punpcklqdq m7, m7
|
||
|
jmp r6
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+48]
|
||
|
movu m2, [tlq+32]
|
||
|
paddw m0, m1
|
||
|
paddw m0, m2
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+16]
|
||
|
paddw m0, m1
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
pshufd m1, m0, q1032
|
||
|
paddw m0, m1
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
pmaddwd m0, m3
|
||
|
psubd m4, m0
|
||
|
pshuflw m0, m4, q1032
|
||
|
@@ -3270,9 +3321,11 @@ cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w,
|
||
|
punpcklqdq m7, m7
|
||
|
jmp r6
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
movq m0, [tlq-8]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
movq m1, [tlq+2]
|
||
|
paddw m0, m1
|
||
|
pmaddwd m0, m3
|
||
|
@@ -3298,6 +3351,7 @@ cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w,
|
||
|
pshuflw m0, m0, q0000
|
||
|
punpcklqdq m0, m0
|
||
|
.s4:
|
||
|
+ _CET_ENDBR
|
||
|
movd m1, alpham
|
||
|
lea r6, [strideq*3]
|
||
|
pshuflw m1, m1, q0000
|
||
|
@@ -3319,9 +3373,11 @@ cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w,
|
||
|
jg .s4_loop
|
||
|
RET
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-16]
|
||
|
jmp wq
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+2]
|
||
|
paddw m0, m1
|
||
|
pmaddwd m0, m3
|
||
|
@@ -3344,6 +3400,7 @@ cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w,
|
||
|
pshuflw m0, m0, q0000
|
||
|
punpcklqdq m0, m0
|
||
|
.s8:
|
||
|
+ _CET_ENDBR
|
||
|
movd m1, alpham
|
||
|
pshuflw m1, m1, q0000
|
||
|
punpcklqdq m1, m1
|
||
|
@@ -3362,10 +3419,12 @@ cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w,
|
||
|
jg .s8_loop
|
||
|
RET
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-32]
|
||
|
paddw m0, [tlq-16]
|
||
|
jmp wq
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+ 2]
|
||
|
movu m2, [tlq+18]
|
||
|
paddw m1, m2
|
||
|
@@ -3390,6 +3449,7 @@ cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w,
|
||
|
pshuflw m0, m0, q0000
|
||
|
punpcklqdq m0, m0
|
||
|
.s16:
|
||
|
+ _CET_ENDBR
|
||
|
movd m1, alpham
|
||
|
pshuflw m1, m1, q0000
|
||
|
punpcklqdq m1, m1
|
||
|
@@ -3408,12 +3468,14 @@ cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w,
|
||
|
jg .s16_loop
|
||
|
RET
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [tlq-64]
|
||
|
paddw m0, [tlq-48]
|
||
|
paddw m0, [tlq-32]
|
||
|
paddw m0, [tlq-16]
|
||
|
jmp wq
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
movu m1, [tlq+ 2]
|
||
|
movu m2, [tlq+18]
|
||
|
paddw m1, m2
|
||
|
@@ -3442,6 +3504,7 @@ cglobal ipred_cfl_16bpc, 4, 7, 8, dst, stride, tl, w,
|
||
|
pshuflw m0, m0, q0000
|
||
|
punpcklqdq m0, m0
|
||
|
.s32:
|
||
|
+ _CET_ENDBR
|
||
|
movd m1, alpham
|
||
|
pshuflw m1, m1, q0000
|
||
|
punpcklqdq m1, m1
|
||
|
@@ -3528,6 +3591,7 @@ cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stri
|
||
|
jg .w4_hpad
|
||
|
jmp .dc
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
%if ARCH_X86_32
|
||
|
cmp dword wpadm, 0
|
||
|
%else
|
||
|
@@ -3582,6 +3646,7 @@ cglobal ipred_cfl_ac_420_16bpc, 3, 7, 6, ac, ypx, stri
|
||
|
pshufd m2, m1, q3333
|
||
|
jmp .w16_wpad_end
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movifnidn wpadd, wpadm
|
||
|
WIN64_SPILL_XMM 7
|
||
|
.w16_loop:
|
||
|
@@ -3698,6 +3763,7 @@ cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stri
|
||
|
add acq, 16*4
|
||
|
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
%if ARCH_X86_32
|
||
|
cmp dword wpadm, 0
|
||
|
%else
|
||
|
@@ -3758,6 +3824,7 @@ cglobal ipred_cfl_ac_422_16bpc, 3, 7, 6, ac, ypx, stri
|
||
|
pshufd m2, m1, q3333
|
||
|
jmp .w16_wpad_end
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movifnidn wpadd, wpadm
|
||
|
WIN64_SPILL_XMM 7
|
||
|
.w16_loop:
|
||
|
@@ -3802,6 +3869,7 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stri
|
||
|
sub hd, hpadd
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
lea r3, [strideq*3]
|
||
|
mov r5, acq
|
||
|
.w4_loop:
|
||
|
@@ -3834,6 +3902,7 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stri
|
||
|
add acq, 16*4
|
||
|
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
mov r5, acq
|
||
|
.w8_loop:
|
||
|
mova m0, [ypxq+strideq*0]
|
||
|
@@ -3863,6 +3932,7 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stri
|
||
|
punpckhqdq m1, m1
|
||
|
jmp .w16_wpad_end
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movifnidn wpadd, wpadm
|
||
|
mov r5, acq
|
||
|
.w16_loop:
|
||
|
@@ -3913,6 +3983,7 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stri
|
||
|
punpckhqdq m3, m3
|
||
|
jmp .w32_wpad_end
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
movifnidn wpadd, wpadm
|
||
|
mov r5, acq
|
||
|
WIN64_SPILL_XMM 8
|
||
|
@@ -3979,6 +4050,7 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx
|
||
|
movifnidn hd, hm
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [idxq]
|
||
|
add idxq, 16
|
||
|
pshufb m1, m3, m0
|
||
|
@@ -3995,6 +4067,7 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx
|
||
|
jg .w4
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [idxq]
|
||
|
add idxq, 16
|
||
|
pshufb m1, m3, m0
|
||
|
@@ -4008,6 +4081,7 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx
|
||
|
jg .w8
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [idxq]
|
||
|
add idxq, 16
|
||
|
pshufb m1, m3, m0
|
||
|
@@ -4021,6 +4095,7 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx
|
||
|
jg .w16
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [idxq+16*0]
|
||
|
pshufb m1, m3, m0
|
||
|
pshufb m2, m4, m0
|
||
|
@@ -4041,6 +4116,7 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx
|
||
|
jg .w32
|
||
|
RET
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
mova m0, [idxq+16*0]
|
||
|
pshufb m1, m3, m0
|
||
|
pshufb m2, m4, m0
|