375 lines
11 KiB
Text
|
Index: src/x86/ipred_avx512.asm
|
||
|
--- src/x86/ipred_avx512.asm.orig
|
||
|
+++ src/x86/ipred_avx512.asm
|
||
|
@@ -168,18 +168,23 @@ cglobal ipred_dc_left_8bpc, 3, 7, 5, dst, stride, tl,
|
||
|
add wq, r5
|
||
|
jmp r6
|
||
|
.h64:
|
||
|
+ _CET_ENDBR
|
||
|
movu ym1, [tlq+32] ; unaligned when jumping here from dc_top
|
||
|
vpdpbusd ym0, ym1, ym2
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
vextracti32x4 xm1, ym0, 1
|
||
|
paddd xm0, xm1
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
punpckhqdq xm1, xm0, xm0
|
||
|
paddd xm0, xm1
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
psrlq xm1, xm0, 32
|
||
|
paddd xm0, xm1
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
vpsrlvd xm0, xmm3
|
||
|
lea stride3q, [strideq*3]
|
||
|
vpbroadcastb m0, xm0
|
||
|
@@ -204,10 +209,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp r6
|
||
|
.h4:
|
||
|
+ _CET_ENDBR
|
||
|
movd xmm1, [tlq-4]
|
||
|
vpdpbusd xm0, xmm1, xm3
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
movd xmm1, [tlq+1]
|
||
|
vpdpbusd xm0, xmm1, xm3
|
||
|
cmp hd, 4
|
||
|
@@ -228,6 +235,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
.w4_end:
|
||
|
vpbroadcastb xm0, xmm0
|
||
|
.s4:
|
||
|
+ _CET_ENDBR
|
||
|
movd [dstq+strideq*0], xm0
|
||
|
movd [dstq+strideq*1], xm0
|
||
|
movd [dstq+strideq*2], xm0
|
||
|
@@ -237,10 +245,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
jg .s4
|
||
|
RET
|
||
|
.h8:
|
||
|
+ _CET_ENDBR
|
||
|
movq xmm1, [tlq-8]
|
||
|
vpdpbusd xm0, xmm1, xm3
|
||
|
jmp wq
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
movq xmm1, [tlq+1]
|
||
|
vextracti32x4 xm2, ym0, 1
|
||
|
vpdpbusd xm0, xmm1, xm3
|
||
|
@@ -261,6 +271,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
.w8_end:
|
||
|
vpbroadcastb xm0, xmm0
|
||
|
.s8:
|
||
|
+ _CET_ENDBR
|
||
|
movq [dstq+strideq*0], xm0
|
||
|
movq [dstq+strideq*1], xm0
|
||
|
movq [dstq+strideq*2], xm0
|
||
|
@@ -270,10 +281,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
jg .s8
|
||
|
RET
|
||
|
.h16:
|
||
|
+ _CET_ENDBR
|
||
|
mova xmm1, [tlq-16]
|
||
|
vpdpbusd xm0, xmm1, xm3
|
||
|
jmp wq
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movu xmm1, [tlq+1]
|
||
|
vextracti32x4 xm2, ym0, 1
|
||
|
vpdpbusd xm0, xmm1, xm3
|
||
|
@@ -294,6 +307,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
.w16_end:
|
||
|
vpbroadcastb xm0, xmm0
|
||
|
.s16:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+strideq*0], xm0
|
||
|
mova [dstq+strideq*1], xm0
|
||
|
mova [dstq+strideq*2], xm0
|
||
|
@@ -303,10 +317,12 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
jg .s16
|
||
|
RET
|
||
|
.h32:
|
||
|
+ _CET_ENDBR
|
||
|
mova ym1, [tlq-32]
|
||
|
vpdpbusd ym0, ym1, ym3
|
||
|
jmp wq
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
movu ym1, [tlq+1]
|
||
|
vpdpbusd ym0, ym1, ym3
|
||
|
vextracti32x4 xm1, ym0, 1
|
||
|
@@ -326,6 +342,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
.w32_end:
|
||
|
vpbroadcastb ym0, xmm0
|
||
|
.s32:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+strideq*0], ym0
|
||
|
mova [dstq+strideq*1], ym0
|
||
|
mova [dstq+strideq*2], ym0
|
||
|
@@ -335,12 +352,14 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
jg .s32
|
||
|
RET
|
||
|
.h64:
|
||
|
+ _CET_ENDBR
|
||
|
mova ym1, [tlq-64]
|
||
|
mova ym2, [tlq-32]
|
||
|
vpdpbusd ym0, ym1, ym3
|
||
|
vpdpbusd ym0, ym2, ym3
|
||
|
jmp wq
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
movu ym1, [tlq+ 1]
|
||
|
movu ym2, [tlq+33]
|
||
|
vpdpbusd ym0, ym1, ym3
|
||
|
@@ -361,6 +380,7 @@ cglobal ipred_dc_8bpc, 3, 7, 5, dst, stride, tl, w, h,
|
||
|
.w64_end:
|
||
|
vpbroadcastb m0, xmm0
|
||
|
.s64:
|
||
|
+ _CET_ENDBR
|
||
|
mova [dstq+strideq*0], m0
|
||
|
mova [dstq+strideq*1], m0
|
||
|
mova [dstq+strideq*2], m0
|
||
|
@@ -401,6 +421,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
|
||
|
add wq, r6
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
mova xmm1, [base+ipred_h_shuf+16]
|
||
|
.w4_loop:
|
||
|
movd xmm0, [tlq+hq-4]
|
||
|
@@ -414,6 +435,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
|
||
|
jg .w4_loop
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
movsldup xmm2, [base+ipred_h_shuf+16]
|
||
|
movshdup xmm3, [base+ipred_h_shuf+16]
|
||
|
.w8_loop:
|
||
|
@@ -429,6 +451,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
|
||
|
jg .w8_loop
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movsldup m1, [base+smooth_shuf]
|
||
|
.w16_loop:
|
||
|
vpbroadcastd m0, [tlq+hq-4]
|
||
|
@@ -442,6 +465,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
|
||
|
jg .w16
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastd ym3, [base+pb_1]
|
||
|
vpord m2, m3, [base+pb_2] {1to16}
|
||
|
.w32_loop:
|
||
|
@@ -457,6 +481,7 @@ cglobal ipred_h_8bpc, 3, 7, 8, dst, stride, tl, w, h,
|
||
|
jg .w32_loop
|
||
|
RET
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastd m4, [base+pb_3]
|
||
|
vpbroadcastd m5, [base+pb_2]
|
||
|
vpbroadcastd m6, [base+pb_1]
|
||
|
@@ -509,6 +534,7 @@ cglobal ipred_paeth_8bpc, 3, 7, 10, dst, stride, tl, w
|
||
|
jmp wq
|
||
|
INIT_YMM avx512icl
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastd m6, [topq]
|
||
|
mova m9, [ipred_h_shuf]
|
||
|
psubusb m7, m5, m6
|
||
|
@@ -536,6 +562,7 @@ INIT_YMM avx512icl
|
||
|
RET
|
||
|
INIT_ZMM avx512icl
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastq m6, [topq]
|
||
|
movsldup m9, [smooth_shuf]
|
||
|
psubusb m7, m5, m6
|
||
|
@@ -564,6 +591,7 @@ INIT_ZMM avx512icl
|
||
|
.w8_ret:
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti32x4 m6, [topq]
|
||
|
movsldup m9, [smooth_shuf]
|
||
|
psubusb m7, m5, m6
|
||
|
@@ -582,6 +610,7 @@ INIT_ZMM avx512icl
|
||
|
jg .w16_loop
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti32x8 m6, [topq]
|
||
|
mova ym9, ym8
|
||
|
psubusb m7, m5, m6
|
||
|
@@ -598,6 +627,7 @@ INIT_ZMM avx512icl
|
||
|
jg .w32_loop
|
||
|
RET
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
movu m6, [topq]
|
||
|
psubusb m7, m5, m6
|
||
|
psubusb m0, m6, m5
|
||
|
@@ -626,6 +656,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastd m2, [tlq+1]
|
||
|
movshdup m5, [smooth_shuf]
|
||
|
mova ym6, [smooth_endA]
|
||
|
@@ -656,6 +687,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
|
||
|
.ret:
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastq m2, [tlq+1]
|
||
|
movshdup m5, [smooth_shuf]
|
||
|
mova ym6, [smooth_endA]
|
||
|
@@ -679,6 +711,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
|
||
|
jl .w8_loop
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti32x4 m3, [tlq+1]
|
||
|
movshdup m6, [smooth_shuf]
|
||
|
mova m7, [smooth_endB]
|
||
|
@@ -707,6 +740,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
|
||
|
jl .w16_loop
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti32x8 m3, [tlq+1]
|
||
|
movshdup m6, [smooth_shuf]
|
||
|
mova m7, [smooth_endB]
|
||
|
@@ -733,6 +767,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
|
||
|
jl .w32_loop
|
||
|
RET
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
movu m3, [tlq+1]
|
||
|
mova m6, [smooth_endB]
|
||
|
punpcklbw m2, m3, m4
|
||
|
@@ -772,6 +807,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
movsldup m3, [smooth_shuf]
|
||
|
vpbroadcastq m7, [smooth_weights+4*2]
|
||
|
mova ym8, [smooth_endA]
|
||
|
@@ -802,6 +838,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
|
||
|
.ret:
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
movsldup m3, [smooth_shuf]
|
||
|
vbroadcasti32x4 m7, [smooth_weights+8*2]
|
||
|
mova ym8, [smooth_endA]
|
||
|
@@ -825,6 +862,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
|
||
|
jg .w8_loop
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
movsldup m7, [smooth_shuf]
|
||
|
vbroadcasti32x4 m8, [smooth_weights+16*2]
|
||
|
vbroadcasti32x4 m9, [smooth_weights+16*3]
|
||
|
@@ -850,6 +888,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
|
||
|
jg .w16_loop
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
mova m10, [smooth_endA]
|
||
|
vpbroadcastd ym7, [pb_1]
|
||
|
vbroadcasti32x8 m8, [smooth_weights+32*2]
|
||
|
@@ -874,6 +913,7 @@ cglobal ipred_smooth_h_8bpc, 4, 7, 11, dst, stride, tl
|
||
|
jg .w32_loop
|
||
|
RET
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
mova m7, [smooth_weights+64*2]
|
||
|
mova m8, [smooth_weights+64*3]
|
||
|
mova m9, [smooth_endA]
|
||
|
@@ -912,6 +952,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastd m8, [tlq+hq+1]
|
||
|
movsldup m4, [smooth_shuf]
|
||
|
movshdup m5, [smooth_shuf]
|
||
|
@@ -954,6 +995,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
|
||
|
.ret:
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
vpbroadcastq m8, [tlq+hq+1]
|
||
|
movsldup m4, [smooth_shuf]
|
||
|
movshdup m5, [smooth_shuf]
|
||
|
@@ -988,6 +1030,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
|
||
|
jg .w8_loop
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti32x4 m9, [tlq+hq+1]
|
||
|
movsldup m5, [smooth_shuf]
|
||
|
movshdup m10, [smooth_shuf]
|
||
|
@@ -1031,6 +1074,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
|
||
|
jg .w16_loop
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
vbroadcasti32x8 m9, [tlq+hq+1]
|
||
|
movshdup m10, [smooth_shuf]
|
||
|
mova m12, [smooth_weights+32*2]
|
||
|
@@ -1073,6 +1117,7 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl,
|
||
|
jg .w32_loop
|
||
|
RET
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
movu m9, [tlq+hq+1]
|
||
|
mova m11, [smooth_weights+64*2]
|
||
|
mova m2, [smooth_weights+64*3]
|
||
|
@@ -1122,6 +1167,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
|
||
|
lea stride3q, [strideq*3]
|
||
|
jmp wq
|
||
|
.w4:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb xmm0, xm4, [idxq]
|
||
|
add idxq, 16
|
||
|
movd [dstq+strideq*0], xmm0
|
||
|
@@ -1133,6 +1179,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
|
||
|
jg .w4
|
||
|
RET
|
||
|
.w8:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb xmm0, xm4, [idxq+16*0]
|
||
|
pshufb xmm1, xm4, [idxq+16*1]
|
||
|
add idxq, 16*2
|
||
|
@@ -1145,6 +1192,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
|
||
|
jg .w8
|
||
|
RET
|
||
|
.w16:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb m0, m4, [idxq]
|
||
|
add idxq, 64
|
||
|
mova [dstq+strideq*0], xm0
|
||
|
@@ -1156,6 +1204,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
|
||
|
jg .w16
|
||
|
RET
|
||
|
.w32:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb m0, m4, [idxq+64*0]
|
||
|
pshufb m1, m4, [idxq+64*1]
|
||
|
add idxq, 64*2
|
||
|
@@ -1168,6 +1217,7 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx,
|
||
|
jg .w32
|
||
|
RET
|
||
|
.w64:
|
||
|
+ _CET_ENDBR
|
||
|
pshufb m0, m4, [idxq+64*0]
|
||
|
pshufb m1, m4, [idxq+64*1]
|
||
|
pshufb m2, m4, [idxq+64*2]
|