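Add ENDBR landing pads to the SSE intra-prediction code. Each .w*/.h*/.s*
label below can be reached through an indirect branch (jmp wq, jmp hq,
jmp r6) driven by a per-size jump table, so under indirect-branch tracking
(Intel CET IBT) each such label must begin with an ENDBR instruction;
labels reached only by direct jumps need no marker, which is why only the
jump-table targets are annotated. The _CET_ENDBR macro itself is defined
outside this file; a minimal sketch of the kind of definition this patch
assumes (not part of this patch):

    %if ARCH_X86_64
        %define _CET_ENDBR endbr64 ; f3 0f 1e fa, decodes as a nop on pre-CET CPUs
    %else
        %define _CET_ENDBR endbr32 ; f3 0f 1e fb
    %endif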
Index: src/x86/ipred_sse.asm
--- src/x86/ipred_sse.asm.orig
+++ src/x86/ipred_sse.asm
@@ -207,14 +207,19 @@ cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h,
lea stride3q, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
IPRED_H 4
.w8:
+ _CET_ENDBR
IPRED_H 8
.w16:
+ _CET_ENDBR
IPRED_H 16
.w32:
+ _CET_ENDBR
IPRED_H 32
.w64:
+ _CET_ENDBR
IPRED_H 64

;---------------------------------------------------------------------------------------
@@ -257,10 +262,12 @@ cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h,
lea stride3q, [strideq*3]
jmp r6
.h4:
+ _CET_ENDBR
movd m0, [tlq-4]
pmaddubsw m0, m3
jmp wq
.w4:
+ _CET_ENDBR
movd m1, [tlq+1]
pmaddubsw m1, m3
psubw m0, m4
@@ -286,6 +293,7 @@ cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h,
pxor m1, m1
pshufb m0, m1
.s4:
+ _CET_ENDBR
movd [dstq+strideq*0], m0
movd [dstq+strideq*1], m0
movd [dstq+strideq*2], m0
@@ -296,10 +304,12 @@ cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h,
RET
ALIGN function_align
.h8:
+ _CET_ENDBR
movq m0, [tlq-8]
pmaddubsw m0, m3
jmp wq
.w8:
+ _CET_ENDBR
movq m1, [tlq+1]
pmaddubsw m1, m3
psubw m4, m0
@@ -322,6 +332,7 @@ ALIGN function_align
pxor m1, m1
pshufb m0, m1
.s8:
+ _CET_ENDBR
movq [dstq+strideq*0], m0
movq [dstq+strideq*1], m0
movq [dstq+strideq*2], m0
@@ -332,10 +343,12 @@ ALIGN function_align
RET
ALIGN function_align
.h16:
+ _CET_ENDBR
mova m0, [tlq-16]
pmaddubsw m0, m3
jmp wq
.w16:
+ _CET_ENDBR
movu m1, [tlq+1]
pmaddubsw m1, m3
paddw m0, m1
@@ -358,6 +371,7 @@ ALIGN function_align
pxor m1, m1
pshufb m0, m1
.s16:
+ _CET_ENDBR
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m0
mova [dstq+strideq*2], m0
@@ -368,6 +382,7 @@ ALIGN function_align
RET
ALIGN function_align
.h32:
+ _CET_ENDBR
mova m0, [tlq-32]
pmaddubsw m0, m3
mova m2, [tlq-16]
@@ -375,6 +390,7 @@ ALIGN function_align
paddw m0, m2
jmp wq
.w32:
+ _CET_ENDBR
movu m1, [tlq+1]
pmaddubsw m1, m3
movu m2, [tlq+17]
@@ -402,6 +418,7 @@ ALIGN function_align
pshufb m0, m1
mova m1, m0
.s32:
+ _CET_ENDBR
mova [dstq], m0
mova [dstq+16], m1
mova [dstq+strideq], m0
@@ -416,6 +433,7 @@ ALIGN function_align
RET
ALIGN function_align
.h64:
+ _CET_ENDBR
mova m0, [tlq-64]
mova m1, [tlq-48]
pmaddubsw m0, m3
@@ -429,6 +447,7 @@ ALIGN function_align
paddw m0, m1
jmp wq
.w64:
+ _CET_ENDBR
movu m1, [tlq+ 1]
movu m2, [tlq+17]
pmaddubsw m1, m3
@@ -463,6 +482,7 @@ ALIGN function_align
mova m2, m0
mova m3, m0
.s64:
+ _CET_ENDBR
mova [dstq], m0
mova [dstq+16], m1
mova [dstq+32], m2
@@ -499,6 +519,7 @@ cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl,
add wq, r5
jmp r6
.h64:
+ _CET_ENDBR
movu m1, [tlq+48] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
@@ -506,16 +527,20 @@ cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl,
pmaddubsw m1, m2
paddw m0, m1
.h32:
+ _CET_ENDBR
movu m1, [tlq+16] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
.h16:
+ _CET_ENDBR
pshufd m1, m0, q3232 ; psrlq m1, m0, 16
paddw m0, m1
.h8:
+ _CET_ENDBR
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
.h4:
+ _CET_ENDBR
pmaddwd m0, m2
pmulhrsw m0, m3
lea stride3q, [strideq*3]
@@ -598,6 +623,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
add wq, r6
jmp wq
.w4:
+ _CET_ENDBR
movd m2, [tlq+1]
punpckldq m2, m2
punpcklbw m2, m5 ; top, bottom
@@ -627,6 +653,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
RET
ALIGN function_align
.w8:
+ _CET_ENDBR
movq m2, [tlq+1]
punpcklbw m2, m5
mova m5, [base+ipred_v_shuf]
@@ -649,6 +676,7 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
+ _CET_ENDBR
movu m3, [tlq+1]
punpcklbw m2, m3, m5
punpckhbw m3, m5
@@ -670,6 +698,7 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
+ _CET_ENDBR
%if WIN64
movaps [rsp+24], xmm7
%define xmm_regs_used 8
@@ -705,6 +734,7 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
+ _CET_ENDBR
%if WIN64
movaps [rsp+24], xmm7
%define xmm_regs_used 8
@@ -758,6 +788,7 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl,
add wq, r6
jmp wq
.w4:
+ _CET_ENDBR
movddup m6, [base+smooth_weights+4*2]
mova m7, [base+ipred_h_shuf]
sub tlq, 4
@@ -794,6 +825,7 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl,
RET
ALIGN function_align
.w8:
+ _CET_ENDBR
mova m6, [base+smooth_weights+8*2]
mova m7, [base+ipred_h_shuf]
sub tlq, 4
@@ -825,6 +857,7 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
+ _CET_ENDBR
mova m6, [base+smooth_weights+16*2]
mova m7, [base+smooth_weights+16*3]
sub tlq, 1
@@ -855,6 +888,7 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
+ _CET_ENDBR
sub tlq, 1
sub tlq, hq
pxor m6, m6
@@ -893,6 +927,7 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
+ _CET_ENDBR
sub tlq, 1
sub tlq, hq
pxor m6, m6
@@ -1020,6 +1055,7 @@ cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, strid
lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height]
jmp wq
.w4:
+ _CET_ENDBR
mova m7, [base+ipred_v_shuf]
movd m1, [tlq+1] ; left
pshufd m1, m1, q0000
@@ -1077,6 +1113,7 @@ cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, strid
RET
ALIGN function_align
.w8:
+ _CET_ENDBR
mova m7, [base+ipred_v_shuf]
movq m1, [tlq+1] ; left
punpcklqdq m1, m1
@@ -1128,6 +1165,7 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
+ _CET_ENDBR
mova m7, [base+ipred_v_shuf]
movu m1, [tlq+1] ; left
sub tlq, 4
@@ -1179,6 +1217,7 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
+ _CET_ENDBR
movu m1, [tlq+1] ; top topleft[1 + x]
movu m2, [tlq+17] ; top
mova [rsp+16*0], m1
@@ -1202,6 +1241,7 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
+ _CET_ENDBR
movu m1, [tlq+1] ; top topleft[1 + x]
movu m2, [tlq+17] ; top
mova [rsp+16*0], m1
@@ -1263,6 +1303,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
xor angled, 0x4ff ; d = 90 - angle
jmp wq
.w4:
+ _CET_ENDBR
lea r3d, [angleq+88]
test r3d, 0x480
jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
@@ -1416,6 +1457,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
.w4_end:
RET
.w8:
+ _CET_ENDBR
lea r3d, [angleq+88]
and r3d, ~0x7f
or r3d, hd
@@ -1567,6 +1609,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
.w8_end:
RET
.w16:
+ _CET_ENDBR
lea r3d, [hq+15]
movd m0, r3d
and r3d, 15
@@ -1669,6 +1712,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
.w16_end:
RET
.w32:
+ _CET_ENDBR
lea r3d, [hq+31]
and r3d, 31
or r3d, 32 ; imin(h+31, 63)
@@ -1780,6 +1824,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
.w32_end:
RET
.w64:
+ _CET_ENDBR
lea r3d, [hq+63]
test angled, 0x400 ; !enable_intra_edge_filter
jnz .w64_main
@@ -2084,6 +2129,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
mov r11d, (128-4)<<6
jmp wq
.w4:
+ _CET_ENDBR
test angled, 0x400
jnz .w4_main
movd m5, [tlq+4]
@@ -2342,6 +2388,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
.w4_ret:
RET
.w8:
+ _CET_ENDBR
test angled, 0x400
jnz .w4_main
movd m5, [tlq+8]
@@ -2454,6 +2501,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
.w8_filter_top_end:
ret
.w16:
+ _CET_ENDBR
test angled, 0x400
jnz .w4_main
lea r3d, [hq+15]
@@ -2525,6 +2573,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
adc r5, -4 ; filter_strength-3
jmp .filter_left
.w32:
+ _CET_ENDBR
test angled, 0x400
jnz .w4_main
pshufb m6, [base+z_filter_t_w16] ; tlq[32]
@@ -2546,6 +2595,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
movu [rsp+r2+16*9], m1
jmp .filter_left
.w64:
+ _CET_ENDBR
movu m0, [tlq+16*2+1]
movu m1, [tlq+16*3+1]
mova [rsp+16*10], m0
@@ -2653,6 +2703,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
jmp hq
.h4:
+ _CET_ENDBR
lea r4d, [angleq+88]
test r4d, 0x480
jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
@@ -2825,6 +2876,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
jg .h4_transpose_loop
RET
.h8:
+ _CET_ENDBR
lea r4d, [angleq+88]
and r4d, ~0x7f
or r4d, wd
@@ -3016,6 +3068,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
movd [dstq+strideq*0], m1
ret
.h16:
+ _CET_ENDBR
lea r4d, [wq+15]
movd m0, r4d
and r4d, 15
@@ -3147,6 +3200,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
punpcklwd m1, m2, m3
jmp .write_4x8_end
.h32:
+ _CET_ENDBR
lea r4d, [wq+31]
and r4d, 31
or r4d, 32 ; imin(w+31, 63)
@@ -3258,6 +3312,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
or r3d, 32
jmp .end_transpose_main
.h64:
+ _CET_ENDBR
lea r4d, [wq+63]
test angled, 0x400 ; !enable_intra_edge_filter
jnz .h64_main
@@ -3494,6 +3549,7 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx,
lea r2, [strideq*3]
jmp wq
.w4:
+ _CET_ENDBR
pshufb m0, m4, [idxq]
add idxq, 16
movd [dstq ], m0
@@ -3509,6 +3565,7 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx,
RET
ALIGN function_align
.w8:
+ _CET_ENDBR
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
add idxq, 32
@@ -3522,6 +3579,7 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
+ _CET_ENDBR
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
@@ -3537,6 +3595,7 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
+ _CET_ENDBR
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
@@ -3552,6 +3611,7 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
+ _CET_ENDBR
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
@@ -3603,10 +3663,12 @@ cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h
movifnidn acq, acmp
jmp r6
.h4:
+ _CET_ENDBR
movd m0, [tlq-4]
pmaddubsw m0, m3
jmp wq
.w4:
+ _CET_ENDBR
movd m1, [tlq+1]
pmaddubsw m1, m3
psubw m0, m4
@@ -3632,6 +3694,7 @@ cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s4:
+ _CET_ENDBR
movd m1, alpham
pshuflw m1, m1, q0000
punpcklqdq m1, m1
@@ -3658,10 +3721,12 @@ cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h
RET
ALIGN function_align
.h8:
+ _CET_ENDBR
movq m0, [tlq-8]
pmaddubsw m0, m3
jmp wq
.w8:
+ _CET_ENDBR
movq m1, [tlq+1]
pmaddubsw m1, m3
psubw m4, m0
@@ -3684,6 +3749,7 @@ ALIGN function_align
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s8:
+ _CET_ENDBR
movd m1, alpham
pshuflw m1, m1, q0000
punpcklqdq m1, m1
@@ -3712,10 +3778,12 @@ ALIGN function_align
RET
ALIGN function_align
.h16:
+ _CET_ENDBR
mova m0, [tlq-16]
pmaddubsw m0, m3
jmp wq
.w16:
+ _CET_ENDBR
movu m1, [tlq+1]
pmaddubsw m1, m3
paddw m0, m1
@@ -3738,6 +3806,7 @@ ALIGN function_align
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s16:
+ _CET_ENDBR
movd m1, alpham
pshuflw m1, m1, q0000
punpcklqdq m1, m1
@@ -3763,6 +3832,7 @@ ALIGN function_align
RET
ALIGN function_align
.h32:
+ _CET_ENDBR
mova m0, [tlq-32]
pmaddubsw m0, m3
mova m2, [tlq-16]
@@ -3770,6 +3840,7 @@ ALIGN function_align
paddw m0, m2
jmp wq
.w32:
+ _CET_ENDBR
movu m1, [tlq+1]
pmaddubsw m1, m3
movu m2, [tlq+17]
@@ -3796,6 +3867,7 @@ ALIGN function_align
pshuflw m0, m0, q0000
punpcklqdq m0, m0
.s32:
+ _CET_ENDBR
movd m1, alpham
pshuflw m1, m1, q0000
punpcklqdq m1, m1
@@ -3845,16 +3917,20 @@ cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl,
movifnidn acq, acmp
jmp r6
.h32:
+ _CET_ENDBR
movu m1, [tlq+16] ; unaligned when jumping here from dc_top
pmaddubsw m1, m2
paddw m0, m1
.h16:
+ _CET_ENDBR
pshufd m1, m0, q3232 ; psrlq m1, m0, 16
paddw m0, m1
.h8:
+ _CET_ENDBR
pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
paddw m0, m1
.h4:
+ _CET_ENDBR
pmaddwd m0, m2
pmulhrsw m0, m3
pshuflw m0, m0, q0000
@@ -3937,6 +4013,7 @@ DECLARE_REG_TMP 4
DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
+ _CET_ENDBR
lea stride3q, [strideq*3]
.w4_loop:
movq m0, [yq]
@@ -3963,6 +4040,7 @@ DECLARE_REG_TMP 4
jg .w4_hpad_loop
jmp .calc_avg_4_8
.w8:
+ _CET_ENDBR
lea stride3q, [strideq*3]
test wpadd, wpadd
jnz .w8_wpad
@@ -4011,6 +4089,7 @@ DECLARE_REG_TMP 4
jg .w8_hpad
jmp .calc_avg_4_8
.w16:
+ _CET_ENDBR
test wpadd, wpadd
jnz .w16_wpad
.w16_loop:
@@ -4176,6 +4255,7 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride,
DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
+ _CET_ENDBR
lea stride3q, [strideq*3]
.w4_loop:
movq m1, [yq]
@@ -4203,6 +4283,7 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride,
jg .w4_hpad_loop
jmp .calc_avg_4
.w8:
+ _CET_ENDBR
lea stride3q, [strideq*3]
test wpadd, wpadd
jnz .w8_wpad
@@ -4257,6 +4338,7 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride,
jg .w8_hpad
jmp .calc_avg_8_16
.w16:
+ _CET_ENDBR
test wpadd, wpadd
jnz .w16_wpad
.w16_loop:
@@ -4452,6 +4534,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y,
DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
%endif
.w4:
+ _CET_ENDBR
lea stride3q, [strideq*3]
.w4_loop:
movd m1, [yq]
@@ -4487,6 +4570,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y,
jmp .calc_avg

.w8:
+ _CET_ENDBR
lea stride3q, [strideq*3]
test wpadd, wpadd
jnz .w8_wpad
@@ -4550,6 +4634,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y,
jmp .calc_avg_8_16

.w16:
+ _CET_ENDBR
test wpadd, wpadd
jnz .w16_wpad
.w16_loop:
@@ -4694,6 +4779,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y,
jmp .calc_avg

.w32:
+ _CET_ENDBR
pxor m0, m0
mova [rsp ], m0
mova [rsp+16], m0
@@ -5071,6 +5157,7 @@ cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride,
add wq, r5
jmp wq
.w4:
+ _CET_ENDBR
movd m6, [tlq+1] ; top
pshufd m6, m6, q0000
lea r3, [strideq*3]
@@ -5096,6 +5183,7 @@ cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride,
RET
ALIGN function_align
.w8:
+ _CET_ENDBR
movddup m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
@@ -5113,6 +5201,7 @@ ALIGN function_align
RET
ALIGN function_align
.w16:
+ _CET_ENDBR
movu m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
@@ -5130,6 +5219,7 @@ ALIGN function_align
RET
ALIGN function_align
.w32:
+ _CET_ENDBR
movu m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
@@ -5158,6 +5248,7 @@ ALIGN function_align
RET
ALIGN function_align
.w64:
+ _CET_ENDBR
movu m6, [tlq+1]
psubusb m7, m5, m6
psubusb m0, m6, m5
@@ -5247,6 +5338,7 @@ cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w
mov hd, hm
jmp wq
.w4:
+ _CET_ENDBR
mova m1, [base+filter_shuf1]
sub tlq, 3
sub tlq, hq
@@ -5266,6 +5358,7 @@ cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w

ALIGN function_align
.w8:
+ _CET_ENDBR
movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4
sub tlq, 5
sub tlq, hq
@@ -5290,6 +5383,7 @@ ALIGN function_align

ALIGN function_align
.w16:
+ _CET_ENDBR
movu m6, [tlq+1] ;top row
sub tlq, 5
sub tlq, hq
@@ -5329,6 +5423,7 @@ ALIGN function_align

ALIGN function_align
.w32:
+ _CET_ENDBR
movu m6, [tlq+1] ;top row
lea filterq, [tlq+17]
sub tlq, 5