Index: src/x86/ipred_sse.asm
--- src/x86/ipred_sse.asm.orig
+++ src/x86/ipred_sse.asm
@@ -207,14 +207,19 @@ cglobal ipred_h_8bpc, 3, 6, 2, dst, stride, tl, w, h,
     lea stride3q, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     IPRED_H 4
 .w8:
+    _CET_ENDBR
     IPRED_H 8
 .w16:
+    _CET_ENDBR
     IPRED_H 16
 .w32:
+    _CET_ENDBR
     IPRED_H 32
 .w64:
+    _CET_ENDBR
     IPRED_H 64
 
 ;---------------------------------------------------------------------------------------
@@ -257,10 +262,12 @@ cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h,
     lea stride3q, [strideq*3]
     jmp r6
 .h4:
+    _CET_ENDBR
     movd m0, [tlq-4]
     pmaddubsw m0, m3
     jmp wq
 .w4:
+    _CET_ENDBR
     movd m1, [tlq+1]
     pmaddubsw m1, m3
     psubw m0, m4
@@ -286,6 +293,7 @@ cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h,
     pxor m1, m1
     pshufb m0, m1
 .s4:
+    _CET_ENDBR
     movd [dstq+strideq*0], m0
     movd [dstq+strideq*1], m0
     movd [dstq+strideq*2], m0
@@ -296,10 +304,12 @@ cglobal ipred_dc_8bpc, 3, 7, 6, dst, stride, tl, w, h,
     RET
 ALIGN function_align
 .h8:
+    _CET_ENDBR
     movq m0, [tlq-8]
     pmaddubsw m0, m3
     jmp wq
 .w8:
+    _CET_ENDBR
     movq m1, [tlq+1]
     pmaddubsw m1, m3
     psubw m4, m0
@@ -322,6 +332,7 @@ ALIGN function_align
     pxor m1, m1
     pshufb m0, m1
 .s8:
+    _CET_ENDBR
     movq [dstq+strideq*0], m0
     movq [dstq+strideq*1], m0
     movq [dstq+strideq*2], m0
@@ -332,10 +343,12 @@ ALIGN function_align
     RET
 ALIGN function_align
 .h16:
+    _CET_ENDBR
     mova m0, [tlq-16]
     pmaddubsw m0, m3
     jmp wq
 .w16:
+    _CET_ENDBR
     movu m1, [tlq+1]
     pmaddubsw m1, m3
     paddw m0, m1
@@ -358,6 +371,7 @@ ALIGN function_align
     pxor m1, m1
     pshufb m0, m1
 .s16:
+    _CET_ENDBR
     mova [dstq+strideq*0], m0
     mova [dstq+strideq*1], m0
     mova [dstq+strideq*2], m0
@@ -368,6 +382,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .h32:
+    _CET_ENDBR
     mova m0, [tlq-32]
     pmaddubsw m0, m3
     mova m2, [tlq-16]
@@ -375,6 +390,7 @@ ALIGN function_align
     paddw m0, m2
     jmp wq
 .w32:
+    _CET_ENDBR
     movu m1, [tlq+1]
     pmaddubsw m1, m3
     movu m2, [tlq+17]
@@ -402,6 +418,7 @@ ALIGN function_align
     pshufb m0, m1
     mova m1, m0
 .s32:
+    _CET_ENDBR
     mova [dstq], m0
     mova [dstq+16], m1
     mova [dstq+strideq], m0
@@ -416,6 +433,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .h64:
+    _CET_ENDBR
     mova m0, [tlq-64]
     mova m1, [tlq-48]
     pmaddubsw m0, m3
@@ -429,6 +447,7 @@ ALIGN function_align
     paddw m0, m1
     jmp wq
 .w64:
+    _CET_ENDBR
     movu m1, [tlq+ 1]
     movu m2, [tlq+17]
     pmaddubsw m1, m3
@@ -463,6 +482,7 @@ ALIGN function_align
     mova m2, m0
     mova m3, m0
 .s64:
+    _CET_ENDBR
     mova [dstq], m0
     mova [dstq+16], m1
     mova [dstq+32], m2
@@ -499,6 +519,7 @@ cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl,
     add wq, r5
     jmp r6
 .h64:
+    _CET_ENDBR
     movu m1, [tlq+48] ; unaligned when jumping here from dc_top
     pmaddubsw m1, m2
     paddw m0, m1
@@ -506,16 +527,20 @@ cglobal ipred_dc_left_8bpc, 3, 7, 6, dst, stride, tl,
     pmaddubsw m1, m2
     paddw m0, m1
 .h32:
+    _CET_ENDBR
     movu m1, [tlq+16] ; unaligned when jumping here from dc_top
     pmaddubsw m1, m2
     paddw m0, m1
 .h16:
+    _CET_ENDBR
     pshufd m1, m0, q3232 ; psrlq m1, m0, 16
     paddw m0, m1
 .h8:
+    _CET_ENDBR
     pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
     paddw m0, m1
 .h4:
+    _CET_ENDBR
     pmaddwd m0, m2
     pmulhrsw m0, m3
     lea stride3q, [strideq*3]
@@ -598,6 +623,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
     add wq, r6
     jmp wq
 .w4:
+    _CET_ENDBR
     movd m2, [tlq+1]
     punpckldq m2, m2
     punpcklbw m2, m5 ; top, bottom
@@ -627,6 +653,7 @@ cglobal ipred_smooth_v_8bpc, 3, 7, 7, dst, stride, tl,
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     movq m2, [tlq+1]
     punpcklbw m2, m5
     mova m5, [base+ipred_v_shuf]
@@ -649,6 +676,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     movu m3, [tlq+1]
     punpcklbw m2, m3, m5
     punpckhbw m3, m5
@@ -670,6 +698,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
 %if WIN64
     movaps [rsp+24], xmm7
 %define xmm_regs_used 8
@@ -705,6 +734,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w64:
+    _CET_ENDBR
 %if WIN64
     movaps [rsp+24], xmm7
 %define xmm_regs_used 8
@@ -758,6 +788,7 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl,
     add wq, r6
     jmp wq
 .w4:
+    _CET_ENDBR
     movddup m6, [base+smooth_weights+4*2]
     mova m7, [base+ipred_h_shuf]
     sub tlq, 4
@@ -794,6 +825,7 @@ cglobal ipred_smooth_h_8bpc, 3, 7, 8, dst, stride, tl,
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     mova m6, [base+smooth_weights+8*2]
     mova m7, [base+ipred_h_shuf]
     sub tlq, 4
@@ -825,6 +857,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     mova m6, [base+smooth_weights+16*2]
     mova m7, [base+smooth_weights+16*3]
     sub tlq, 1
@@ -855,6 +888,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     sub tlq, 1
     sub tlq, hq
     pxor m6, m6
@@ -893,6 +927,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w64:
+    _CET_ENDBR
     sub tlq, 1
     sub tlq, hq
     pxor m6, m6
@@ -1020,6 +1055,7 @@ cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, strid
     lea v_weightsq, [base+smooth_weights+hq*2] ; weights_ver = &dav1d_sm_weights[height]
     jmp wq
 .w4:
+    _CET_ENDBR
     mova m7, [base+ipred_v_shuf]
     movd m1, [tlq+1] ; left
     pshufd m1, m1, q0000
@@ -1077,6 +1113,7 @@ cglobal ipred_smooth_8bpc, 3, 7, 8, -13*16, dst, strid
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     mova m7, [base+ipred_v_shuf]
     movq m1, [tlq+1] ; left
     punpcklqdq m1, m1
@@ -1128,6 +1165,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     mova m7, [base+ipred_v_shuf]
     movu m1, [tlq+1] ; left
     sub tlq, 4
@@ -1179,6 +1217,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     movu m1, [tlq+1] ; top topleft[1 + x]
     movu m2, [tlq+17] ; top
     mova [rsp+16*0], m1
@@ -1202,6 +1241,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w64:
+    _CET_ENDBR
     movu m1, [tlq+1] ; top topleft[1 + x]
     movu m2, [tlq+17] ; top
     mova [rsp+16*0], m1
@@ -1263,6 +1303,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
     xor angled, 0x4ff ; d = 90 - angle
     jmp wq
 .w4:
+    _CET_ENDBR
     lea r3d, [angleq+88]
     test r3d, 0x480
     jnz .w4_no_upsample ; !enable_intra_edge_filter || angle >= 40
@@ -1416,6 +1457,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
 .w4_end:
     RET
 .w8:
+    _CET_ENDBR
     lea r3d, [angleq+88]
     and r3d, ~0x7f
     or r3d, hd
@@ -1567,6 +1609,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
 .w8_end:
     RET
 .w16:
+    _CET_ENDBR
     lea r3d, [hq+15]
     movd m0, r3d
     and r3d, 15
@@ -1669,6 +1712,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
 .w16_end:
     RET
 .w32:
+    _CET_ENDBR
     lea r3d, [hq+31]
     and r3d, 31
     or r3d, 32 ; imin(h+31, 63)
@@ -1780,6 +1824,7 @@ cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, _, tl, w,
 .w32_end:
     RET
 .w64:
+    _CET_ENDBR
     lea r3d, [hq+63]
     test angled, 0x400 ; !enable_intra_edge_filter
     jnz .w64_main
@@ -2084,6 +2129,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
     mov r11d, (128-4)<<6
     jmp wq
 .w4:
+    _CET_ENDBR
     test angled, 0x400
     jnz .w4_main
     movd m5, [tlq+4]
@@ -2342,6 +2388,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
 .w4_ret:
     RET
 .w8:
+    _CET_ENDBR
     test angled, 0x400
     jnz .w4_main
     movd m5, [tlq+8]
@@ -2454,6 +2501,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
 .w8_filter_top_end:
     ret
 .w16:
+    _CET_ENDBR
     test angled, 0x400
     jnz .w4_main
     lea r3d, [hq+15]
@@ -2525,6 +2573,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
     adc r5, -4 ; filter_strength-3
     jmp .filter_left
 .w32:
+    _CET_ENDBR
     test angled, 0x400
     jnz .w4_main
     pshufb m6, [base+z_filter_t_w16] ; tlq[32]
@@ -2546,6 +2595,7 @@ cglobal ipred_z2_8bpc, 4, 7, 8, -16*20, dst, _, tl, w,
     movu [rsp+r2+16*9], m1
     jmp .filter_left
 .w64:
+    _CET_ENDBR
     movu m0, [tlq+16*2+1]
     movu m1, [tlq+16*3+1]
     mova [rsp+16*10], m0
@@ -2653,6 +2703,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
     movzx dyd, word [base+dr_intra_derivative+45*2-1+dyq]
     jmp hq
 .h4:
+    _CET_ENDBR
     lea r4d, [angleq+88]
     test r4d, 0x480
     jnz .h4_no_upsample ; !enable_intra_edge_filter || angle >= 40
@@ -2825,6 +2876,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
     jg .h4_transpose_loop
     RET
 .h8:
+    _CET_ENDBR
     lea r4d, [angleq+88]
     and r4d, ~0x7f
     or r4d, wd
@@ -3016,6 +3068,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
     movd [dstq+strideq*0], m1
     ret
 .h16:
+    _CET_ENDBR
     lea r4d, [wq+15]
     movd m0, r4d
     and r4d, 15
@@ -3147,6 +3200,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
     punpcklwd m1, m2, m3
     jmp .write_4x8_end
 .h32:
+    _CET_ENDBR
     lea r4d, [wq+31]
     and r4d, 31
     or r4d, 32 ; imin(w+31, 63)
@@ -3258,6 +3312,7 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, t
     or r3d, 32
     jmp .end_transpose_main
 .h64:
+    _CET_ENDBR
     lea r4d, [wq+63]
     test angled, 0x400 ; !enable_intra_edge_filter
     jnz .h64_main
@@ -3494,6 +3549,7 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx,
     lea r2, [strideq*3]
     jmp wq
 .w4:
+    _CET_ENDBR
     pshufb m0, m4, [idxq]
     add idxq, 16
     movd [dstq ], m0
@@ -3509,6 +3565,7 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx,
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     pshufb m0, m4, [idxq]
     pshufb m1, m4, [idxq+16]
     add idxq, 32
@@ -3522,6 +3579,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     pshufb m0, m4, [idxq]
     pshufb m1, m4, [idxq+16]
     pshufb m2, m4, [idxq+32]
@@ -3537,6 +3595,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     pshufb m0, m4, [idxq]
     pshufb m1, m4, [idxq+16]
     pshufb m2, m4, [idxq+32]
@@ -3552,6 +3611,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w64:
+    _CET_ENDBR
     pshufb m0, m4, [idxq]
     pshufb m1, m4, [idxq+16]
     pshufb m2, m4, [idxq+32]
@@ -3603,10 +3663,12 @@ cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h
     movifnidn acq, acmp
     jmp r6
 .h4:
+    _CET_ENDBR
     movd m0, [tlq-4]
     pmaddubsw m0, m3
     jmp wq
 .w4:
+    _CET_ENDBR
     movd m1, [tlq+1]
     pmaddubsw m1, m3
     psubw m0, m4
@@ -3632,6 +3694,7 @@ cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h
     pshuflw m0, m0, q0000
     punpcklqdq m0, m0
 .s4:
+    _CET_ENDBR
     movd m1, alpham
     pshuflw m1, m1, q0000
     punpcklqdq m1, m1
@@ -3658,10 +3721,12 @@ cglobal ipred_cfl_8bpc, 3, 7, 6, dst, stride, tl, w, h
     RET
 ALIGN function_align
 .h8:
+    _CET_ENDBR
     movq m0, [tlq-8]
     pmaddubsw m0, m3
     jmp wq
 .w8:
+    _CET_ENDBR
     movq m1, [tlq+1]
     pmaddubsw m1, m3
     psubw m4, m0
@@ -3684,6 +3749,7 @@ ALIGN function_align
     pshuflw m0, m0, q0000
     punpcklqdq m0, m0
 .s8:
+    _CET_ENDBR
     movd m1, alpham
     pshuflw m1, m1, q0000
     punpcklqdq m1, m1
@@ -3712,10 +3778,12 @@ ALIGN function_align
     RET
 ALIGN function_align
 .h16:
+    _CET_ENDBR
     mova m0, [tlq-16]
     pmaddubsw m0, m3
     jmp wq
 .w16:
+    _CET_ENDBR
     movu m1, [tlq+1]
     pmaddubsw m1, m3
     paddw m0, m1
@@ -3738,6 +3806,7 @@ ALIGN function_align
     pshuflw m0, m0, q0000
     punpcklqdq m0, m0
 .s16:
+    _CET_ENDBR
     movd m1, alpham
     pshuflw m1, m1, q0000
     punpcklqdq m1, m1
@@ -3763,6 +3832,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .h32:
+    _CET_ENDBR
     mova m0, [tlq-32]
     pmaddubsw m0, m3
     mova m2, [tlq-16]
@@ -3770,6 +3840,7 @@ ALIGN function_align
     paddw m0, m2
     jmp wq
 .w32:
+    _CET_ENDBR
     movu m1, [tlq+1]
     pmaddubsw m1, m3
     movu m2, [tlq+17]
@@ -3796,6 +3867,7 @@ ALIGN function_align
     pshuflw m0, m0, q0000
     punpcklqdq m0, m0
 .s32:
+    _CET_ENDBR
     movd m1, alpham
     pshuflw m1, m1, q0000
     punpcklqdq m1, m1
@@ -3845,16 +3917,20 @@ cglobal ipred_cfl_left_8bpc, 3, 7, 6, dst, stride, tl,
     movifnidn acq, acmp
     jmp r6
 .h32:
+    _CET_ENDBR
     movu m1, [tlq+16] ; unaligned when jumping here from dc_top
     pmaddubsw m1, m2
     paddw m0, m1
 .h16:
+    _CET_ENDBR
     pshufd m1, m0, q3232 ; psrlq m1, m0, 16
     paddw m0, m1
 .h8:
+    _CET_ENDBR
     pshuflw m1, m0, q1032 ; psrlq m1, m0, 32
     paddw m0, m1
 .h4:
+    _CET_ENDBR
     pmaddwd m0, m2
     pmulhrsw m0, m3
     pshuflw m0, m0, q0000
@@ -3937,6 +4013,7 @@ DECLARE_REG_TMP 4
 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
 %endif
 .w4:
+    _CET_ENDBR
     lea stride3q, [strideq*3]
 .w4_loop:
     movq m0, [yq]
@@ -3963,6 +4040,7 @@ DECLARE_REG_TMP 4
     jg .w4_hpad_loop
     jmp .calc_avg_4_8
 .w8:
+    _CET_ENDBR
     lea stride3q, [strideq*3]
     test wpadd, wpadd
     jnz .w8_wpad
@@ -4011,6 +4089,7 @@ DECLARE_REG_TMP 4
     jg .w8_hpad
     jmp .calc_avg_4_8
 .w16:
+    _CET_ENDBR
     test wpadd, wpadd
     jnz .w16_wpad
 .w16_loop:
@@ -4176,6 +4255,7 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride,
 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
 %endif
 .w4:
+    _CET_ENDBR
     lea stride3q, [strideq*3]
 .w4_loop:
     movq m1, [yq]
@@ -4203,6 +4283,7 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride,
     jg .w4_hpad_loop
     jmp .calc_avg_4
 .w8:
+    _CET_ENDBR
     lea stride3q, [strideq*3]
     test wpadd, wpadd
     jnz .w8_wpad
@@ -4257,6 +4338,7 @@ cglobal ipred_cfl_ac_422_8bpc, 4, 7, 7, ac, y, stride,
     jg .w8_hpad
     jmp .calc_avg_8_16
 .w16:
+    _CET_ENDBR
     test wpadd, wpadd
     jnz .w16_wpad
 .w16_loop:
@@ -4452,6 +4534,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y,
 DEFINE_ARGS ac, y, stride, wpad, hpad, stride3, h
 %endif
 .w4:
+    _CET_ENDBR
     lea stride3q, [strideq*3]
 .w4_loop:
     movd m1, [yq]
@@ -4487,6 +4570,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y,
     jmp .calc_avg
 
 .w8:
+    _CET_ENDBR
     lea stride3q, [strideq*3]
     test wpadd, wpadd
     jnz .w8_wpad
@@ -4550,6 +4634,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y,
     jmp .calc_avg_8_16
 
 .w16:
+    _CET_ENDBR
     test wpadd, wpadd
     jnz .w16_wpad
 .w16_loop:
@@ -4694,6 +4779,7 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 7, 7, -5*16, ac, y,
     jmp .calc_avg
 
 .w32:
+    _CET_ENDBR
     pxor m0, m0
     mova [rsp ], m0
     mova [rsp+16], m0
@@ -5071,6 +5157,7 @@ cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride,
     add wq, r5
     jmp wq
 .w4:
+    _CET_ENDBR
     movd m6, [tlq+1] ; top
     pshufd m6, m6, q0000
     lea r3, [strideq*3]
@@ -5096,6 +5183,7 @@ cglobal ipred_paeth_8bpc, 3, 6, 8, -7*16, dst, stride,
     RET
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     movddup m6, [tlq+1]
     psubusb m7, m5, m6
     psubusb m0, m6, m5
@@ -5113,6 +5201,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     movu m6, [tlq+1]
     psubusb m7, m5, m6
     psubusb m0, m6, m5
@@ -5130,6 +5219,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     movu m6, [tlq+1]
     psubusb m7, m5, m6
     psubusb m0, m6, m5
@@ -5158,6 +5248,7 @@ ALIGN function_align
     RET
 ALIGN function_align
 .w64:
+    _CET_ENDBR
     movu m6, [tlq+1]
     psubusb m7, m5, m6
     psubusb m0, m6, m5
@@ -5247,6 +5338,7 @@ cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w
     mov hd, hm
     jmp wq
 .w4:
+    _CET_ENDBR
     mova m1, [base+filter_shuf1]
     sub tlq, 3
     sub tlq, hq
@@ -5266,6 +5358,7 @@ cglobal ipred_filter_8bpc, 3, 7, 8, dst, stride, tl, w
 
 ALIGN function_align
 .w8:
+    _CET_ENDBR
     movq m6, [tlq+1] ;_ _ _ 0 1 2 3 4
     sub tlq, 5
     sub tlq, hq
@@ -5290,6 +5383,7 @@ ALIGN function_align
 
 ALIGN function_align
 .w16:
+    _CET_ENDBR
     movu m6, [tlq+1] ;top row
     sub tlq, 5
     sub tlq, hq
@@ -5329,6 +5423,7 @@ ALIGN function_align
 
 ALIGN function_align
 .w32:
+    _CET_ENDBR
     movu m6, [tlq+1] ;top row
     lea filterq, [tlq+17]
    sub tlq, 5
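
Note on the `_CET_ENDBR` macro: its definition is not part of the hunks shown here; the patch assumes it exists elsewhere in the build. The rationale for the insertions is that every `.w*`/`.h*`/`.s*` label touched above is reached through an indirect jump out of a per-size dispatch table (`jmp wq`, `jmp r6`, `jmp hq`), so under Intel CET indirect branch tracking (IBT) each such target must begin with an `endbr64`/`endbr32` instruction, or the CPU raises a control-protection fault. A minimal NASM-style sketch of a compatible definition follows; the macro name mirrors glibc's `<cet.h>` convention, but the `ENABLE_CET` guard is an assumed build flag, not something this patch defines:

    ; Hypothetical companion definition (e.g. near the top of x86inc.asm).
    ; ENABLE_CET is an assumed build-time flag; substitute the real one.
    %ifdef ENABLE_CET
        %if ARCH_X86_64
            %define _CET_ENDBR endbr64 ; F3 0F 1E FA: valid IBT landing pad
        %else
            %define _CET_ENDBR endbr32 ; F3 0F 1E FB: 32-bit variant
        %endif
    %else
        %define _CET_ENDBR             ; expands to nothing when CET is off
    %endif

Since `endbr64` decodes as a multi-byte NOP on non-CET hardware, emitting it unconditionally would also be safe; gating it behind a flag merely keeps the extra 4 bytes per jump target out of builds that do not want them. Assemblers too old to know the mnemonic can emit the raw encoding instead (`db 0xf3, 0x0f, 0x1e, 0xfa`).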