x86: Add 8-bit ipred z1 SSSE3 asm
diff --git a/src/x86/ipred.h b/src/x86/ipred.h
index 7df563f..a12e7f1 100644
--- a/src/x86/ipred.h
+++ b/src/x86/ipred.h
@@ -83,6 +83,9 @@
init_angular_ipred_fn(SMOOTH_PRED, ipred_smooth, ssse3);
init_angular_ipred_fn(SMOOTH_H_PRED, ipred_smooth_h, ssse3);
init_angular_ipred_fn(SMOOTH_V_PRED, ipred_smooth_v, ssse3);
+#if BITDEPTH == 8
+ init_angular_ipred_fn(Z1_PRED, ipred_z1, ssse3);
+#endif
init_angular_ipred_fn(FILTER_PRED, ipred_filter, ssse3);
init_cfl_pred_fn(DC_PRED, ipred_cfl, ssse3);
diff --git a/src/x86/ipred_sse.asm b/src/x86/ipred_sse.asm
index 9f548aa..9473351 100644
--- a/src/x86/ipred_sse.asm
+++ b/src/x86/ipred_sse.asm
@@ -55,22 +55,50 @@
18, 16, 15, 13, 12, 10, 9, 8, \
7, 6, 6, 5, 5, 4, 4, 4
-ipred_v_shuf : db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
-ipred_h_shuf : db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
-ipred_paeth_shuf : db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
-filter_shuf1 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
-filter_shuf2 : db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
-
-pw_8 : times 8 dw 8
-pb_3 : times 16 db 3
-pb_128 : times 8 db 128
-pw_128 : times 4 dw 128
-pw_255 : times 4 dw 255
-pb_2 : times 8 db 2
-pb_4 : times 8 db 4
-pb_127_m127 : times 4 db 127, -127
-pd_32768 : times 1 dd 32768
-
+ipred_v_shuf: db 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7
+ipred_h_shuf: db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
+ipred_paeth_shuf: db 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0
+filter_shuf1: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 2, 7, 2, 1, -1, 1, -1
+filter_shuf2: db 3, 4, 3, 4, 5, 6, 5, 6, 7, 11, 7, 11, 15, -1, 15, -1
+z_filter_wh4: db 7, 7, 19, 7,
+z_filter_wh8: db 19, 19, 11, 19, 11, 15, 15, 15, 23, 23, 23, 23, 39, 39, 39, 39
+pb_8: times 8 db 8
+pd_32768: dd 32768
+z_filter_wh16: db 19, 19, 19, 23, 23, 23, 31, 31, 31, 47, 47, 47, 79, 79, 79, -1
+z_filter_t_w48: db 55,127, 7,127, 15, 31, 39, 31,127, 39,127, 39, 7, 15, 31, 15
+ db 39, 63, 3, 63, 3, 3, 19, 3, 47, 19, 47, 19, 3, 3, 3, 3
+z_filter_t_w16: db 15, 31, 7, 15, 31, 7, 3, 31, 3, 3, 3, 3, 3, 3, 0, 0
+z_upsample1: db 1, 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7
+z_upsample2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8, 8
+z1_shuf_w4: db 0, 1, 1, 2, 2, 3, 3, 4, 8, 9, 9, 10, 10, 11, 11, 12
+pb_0to15: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+z_base_inc: dw 0*64, 1*64, 2*64, 3*64, 4*64, 5*64, 6*64, 7*64
+z_filter_s: db 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7
+ db 7, 8, 8, 9, 9, 10, 10, 11
+z_filter_k_tail: db 0, 64, 0, 64, 8, 56, 0, 64
+z_filter_k: times 4 db 0, 16
+ times 4 db 0, 20
+ times 4 db 8, 16
+ times 4 db 32, 16
+ times 4 db 24, 20
+ times 4 db 16, 16
+ times 4 db 0, 0
+ times 4 db 0, 0
+pw_8: times 8 db 8, 0
+pb_3: times 16 db 3
+pw_62: times 8 dw 62
+pw_64: times 8 dw 64
+pw_256: times 8 dw 256
+pw_512: times 8 dw 512
+pw_m256: times 8 dw -256
+pb_2: times 8 db 2
+pb_4: times 8 db 4
+pb_128: times 8 db 128
+pb_m16: times 8 db -16
+pw_128: times 4 dw 128
+pw_255: times 4 dw 255
+pb_36_m4: times 4 db 36, -4
+pb_127_m127: times 4 db 127, -127
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
@@ -93,15 +121,16 @@
JMP_TABLE ipred_smooth_v, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth, ssse3, w4, w8, w16, w32, w64
+JMP_TABLE ipred_z1, ssse3, w4, w8, w16, w32, w64
JMP_TABLE pal_pred, ssse3, w4, w8, w16, w32, w64
JMP_TABLE ipred_cfl, ssse3, h4, h8, h16, h32, w4, w8, w16, w32, \
s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left, ssse3, h4, h8, h16, h32
JMP_TABLE ipred_filter, ssse3, w4, w8, w16, w32
+cextern dr_intra_derivative
cextern filter_intra_taps
-
SECTION .text
;---------------------------------------------------------------------------------------
@@ -1190,6 +1219,779 @@
jg .w64_loop
RET
+%if ARCH_X86_64
+cglobal ipred_z1_8bpc, 3, 8, 11, 16*12, dst, stride, tl, w, h, angle, dx
+ %define base r7-$$ ; PIC base: every [base+sym] address resolves via r7
+ lea r7, [$$]
+ movifnidn hd, hm ; load h from the stack if not already in a reg
+ mova m8, [base+pw_62] ; enough xmm regs on x86-64 to cache the constants
+ mova m9, [base+pw_64]
+ mova m10, [base+pw_512]
+%else
+cglobal ipred_z1_8bpc, 3, 7, 8, -16*13, dst, stride, tl, w, h, angle, dx
+ %define base r4-$$
+ %define m8 [base+pw_62] ; only 8 xmm regs on x86-32: constants stay in memory
+ %define m9 [base+pw_64]
+ %define m10 [base+pw_512]
+ %define hd dword [rsp+16*12] ; h spilled to the stack frame
+ %define hb byte [rsp+16*12]
+ mov r3, hm
+ LEA r4, $$
+ mov hd, r3
+%endif
+ tzcnt wd, wm ; wd = log2(width), jump-table index
+ movifnidn angled, anglem
+ inc tlq ; step past the top-left pixel to top[0]
+ movsxd wq, [base+ipred_z1_ssse3_table+wq*4]
+ lea wq, [base+wq+ipred_z1_ssse3_table] ; width-specific entry point
+ mov dxd, angled
+ and dxd, 0x7e ; even angle bits index dr_intra_derivative
+ add angled, 165 ; ~90
+ movzx dxd, word [base+dr_intra_derivative+dxq] ; dx = x step per row, 6-bit fraction
+ xor angled, 0x4ff ; d = 90 - angle
+ jmp wq
+.w4: ; w=4: optional 2x edge upsample, else optional edge filter
+%if ARCH_X86_64
+ cmp angleb, 40
+%else
+ mov r3d, angled ; rNb only valid for r0-r3 on x86-32
+ cmp r3b, 40
+%endif
+ jae .w4_no_upsample ; no upsampling when angle delta >= 40
+ lea r3d, [angleq-1024]
+ sar r3d, 7
+ add r3d, hd
+ jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
+ mova m1, [tlq-1]
+ pshufb m0, m1, [base+z_upsample1] ; gather neighbor pairs for the 4-tap upsample
+ pshufb m1, [base+z_upsample2]
+ movddup m2, [base+pb_36_m4] ; upsample filter taps (36, -4)
+ add dxd, dxd ; upsampled edge => double the x step
+ pmaddubsw m0, m2
+ pshufd m7, m1, q3333
+ movd [rsp+16], m7 ; top[max_base_x]
+ pmaddubsw m1, m2
+ movd m6, dxd
+ mov r3d, dxd ; xpos
+ pshufb m6, [base+pw_256]
+ paddw m1, m0
+ movq m0, [tlq]
+ pmulhrsw m1, m10 ; (x + 32) >> 6 rounding via pw_512
+ paddw m7, m6, m6
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ packuswb m1, m1
+ punpcklbw m0, m1 ; interleave original and interpolated pixels
+ mova [rsp], m0 ; upsampled top[] buffer on the stack
+.w4_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movq m0, [rsp+r3]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movhps m0, [rsp+r2]
+ pand m2, m8, m6 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1 ; a*(64-frac) + b*frac
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m10
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_upsample_loop
+ RET
+.w4_no_upsample:
+ mov r3d, 7 ; max_base
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w4_main
+%if ARCH_X86_64
+ lea r3d, [hq+3]
+%else
+ mov r3d, hd
+ add r3d, 3
+%endif
+ movd m0, r3d
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1 ; broadcast w+h
+ pshufb m2, m1 ; broadcast angle delta
+ pcmpeqb m1, m0, [base+z_filter_wh4]
+ pand m1, m2
+ pcmpgtb m1, [base+z_filter_t_w48+angleq*8]
+ pmovmskb r5d, m1
+ mov r3d, 7
+ test r5d, r5d
+ jz .w4_main ; filter_strength == 0
+ mova m3, [tlq-1]
+ imul r5d, 0x55555555 ; mask -> filter strength in {1,2,3}
+ movu m7, [base+z_filter_s+8]
+ shr r5d, 30 ; filter_strength
+ movddup m0, [base+pb_8]
+ pminub m7, m0 ; clamp shuffle indices to the last valid pixel
+ pshufb m0, m3, [base+z_filter_s]
+ movddup m4, [base+z_filter_k-8+r5*8+24*0] ; smoothing taps for this strength
+ pshufb m3, m7
+ movddup m5, [base+z_filter_k-8+r5*8+24*1]
+ shufps m2, m0, m3, q2121
+ movddup m6, [base+z_filter_k-8+r5*8+24*2]
+ pmaddubsw m0, m4
+ pmaddubsw m1, m2, m4
+ pmaddubsw m2, m5
+ paddd m5, m6
+ pmaddubsw m4, m3, m5
+ pmaddubsw m3, m6
+ paddw m0, m2
+ paddw m1, m4
+ paddw m0, m3
+ pshufd m1, m1, q3333
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ mov r5d, 9
+ mov tlq, rsp ; filtered edge lives on the stack
+ cmp hd, 4
+ cmovne r3d, r5d ; max_base = 9 unless h == 4
+ packuswb m0, m1
+ mova [tlq], m0
+.w4_main:
+ add tlq, r3
+ movd m5, dxd
+ movddup m0, [base+z_base_inc] ; base_inc << 6
+ movd m7, [tlq] ; top[max_base_x]
+ shl r3d, 6
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd ; xpos
+ pshufb m7, [base+pw_m256]
+ sub r5, r3 ; bias xpos: sign flips once base >= max_base_x
+ pshufb m4, [base+pw_256]
+ mova m3, [base+z1_shuf_w4]
+ paddw m6, m5, m5
+ psubw m4, m0 ; max_base_x
+ punpcklqdq m5, m6 ; xpos0 xpos1
+.w4_loop:
+ lea r3, [r5+dxq]
+ sar r5, 6 ; base0
+ movq m0, [tlq+r5]
+ lea r5, [r3+dxq]
+ sar r3, 6 ; base1
+ movhps m0, [tlq+r3]
+ pand m2, m8, m5 ; frac
+ psubw m1, m9, m2 ; 64-frac
+ psllw m2, 8
+ pshufb m0, m3
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1 ; a*(64-frac) + b*frac
+ pcmpgtw m1, m4, m5 ; base < max_base_x
+ pmulhrsw m0, m10
+ paddw m5, m6 ; xpos += dx
+ pand m0, m1 ; keep in-range lanes
+ pandn m1, m7 ; out-of-range lanes -> top[max_base_x]
+ por m0, m1
+ packuswb m0, m0
+ movd [dstq+strideq*0], m0
+ pshuflw m0, m0, q1032
+ movd [dstq+strideq*1], m0
+ sub hd, 2
+ jz .w4_end
+ lea dstq, [dstq+strideq*2]
+ test r5d, r5d
+ jl .w4_loop
+ packuswb m7, m7 ; remaining rows are all top[max_base_x]
+.w4_end_loop:
+ movd [dstq+strideq*0], m7
+ movd [dstq+strideq*1], m7
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w4_end_loop
+.w4_end:
+ RET
+.w8: ; w=8: same structure as w4, two rows per iteration while upsampling
+ lea r3d, [angleq+216]
+ mov r3b, hb ; low byte = h; high bits nonzero unless upsampling allowed
+ cmp r3d, 8
+ ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
+ mova m5, [base+z_upsample1]
+ movu m3, [base+z_filter_s+6]
+ movd m4, hd
+ mova m0, [tlq-1]
+ movu m1, [tlq+7]
+ pxor m7, m7
+ pshufb m4, m7 ; broadcast h
+ movddup m7, [base+pb_36_m4] ; upsample filter taps (36, -4)
+ pminub m4, m3 ; clamp shuffle indices to the last valid pixel
+ add dxd, dxd ; upsampled edge => double the x step
+ pshufb m2, m0, m5
+ pmaddubsw m2, m7
+ pshufb m0, m3
+ pmaddubsw m0, m7
+ movd m6, dxd
+ pshufb m3, m1, m5
+ pmaddubsw m3, m7
+ pshufb m1, m4
+ pmaddubsw m1, m7
+ pshufb m6, [base+pw_256]
+ mov r3d, dxd ; xpos
+ paddw m2, m0
+ paddw m7, m6, m6
+ paddw m3, m1
+ punpcklqdq m6, m7 ; xpos0 xpos1
+ movu m1, [tlq]
+ pmulhrsw m2, m10 ; (x + 32) >> 6 rounding via pw_512
+ pmulhrsw m3, m10
+ packuswb m2, m3
+ punpcklbw m0, m1, m2 ; interleave original and interpolated pixels
+ punpckhbw m1, m2
+ mova [rsp+16*0], m0 ; upsampled top[] buffer on the stack
+ mova [rsp+16*1], m1
+.w8_upsample_loop:
+ lea r2d, [r3+dxq]
+ shr r3d, 6 ; base0
+ movu m0, [rsp+r3]
+ lea r3d, [r2+dxq]
+ shr r2d, 6 ; base1
+ movu m1, [rsp+r2]
+ pand m2, m8, m6 ; frac
+ psubw m3, m9, m2 ; 64-frac
+ psllw m2, 8
+ por m3, m2 ; 64-frac, frac
+ punpcklqdq m2, m3, m3 ; frac0
+ pmaddubsw m0, m2
+ punpckhqdq m3, m3 ; frac1
+ pmaddubsw m1, m3
+ paddw m6, m7 ; xpos += dx
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packuswb m0, m1
+ movq [dstq+strideq*0], m0
+ movhps [dstq+strideq*1], m0
+ lea dstq, [dstq+strideq*2]
+ sub hd, 2
+ jg .w8_upsample_loop
+ RET
+.w8_no_upsample:
+%if ARCH_X86_64
+ lea r3d, [hq+7]
+%else
+ mov r3d, hd
+ add r3d, 7
+%endif
+ movd m0, r3d
+ and r3d, 7
+ or r3d, 8 ; imin(h+7, 15)
+ test angled, 0x400
+ jnz .w8_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1 ; broadcast h+7
+ pshufb m2, m1 ; broadcast angle delta
+ movu m1, [base+z_filter_wh8]
+ psrldq m3, [base+z_filter_t_w48+angleq*8], 4
+ pcmpeqb m1, m0
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w8_main ; filter_strength == 0
+ mova m0, [tlq- 1]
+ imul r5d, 0x55555555 ; mask -> filter strength in {1,2,3}
+ mova m1, [tlq+15]
+ shr r5d, 30 ; filter_strength
+ movd m2, [tlq+r3]
+ lea tlq, [rsp+16*4] ; edge copy on the stack
+ sub r5, 3 ; filter_strength-3 (0 selects 5-tap in .filter_edge)
+ mova [tlq-16*1], m0
+ pxor m3, m3
+ mova [tlq+16*0], m1
+ pshufb m0, m3
+ pshufb m2, m3
+ mova [tlq-16*2], m0 ; replicate first pixel to the left
+ movq [tlq+r3-15], m2 ; pad with replicated last pixel
+ call .filter_edge
+ sar r5d, 1
+ add r5d, 17
+ cmp hd, 8
+ cmova r3d, r5d ; enlarged max_base for the filtered edge when h > 8
+.w8_main:
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq] ; top[max_base_x]
+ shl r3d, 6
+ movu m3, [base+z_filter_s+2]
+ movd m4, r3d
+ pshufb m5, [base+pw_256]
+ mov r5d, dxd ; xpos
+ pshufb m7, [base+pw_m256]
+ sub r5, r3 ; bias xpos: sign flips once base >= max_base_x
+ pshufb m4, [base+pw_256]
+ psubw m4, [base+z_base_inc] ; per-lane max_base_x - base_inc
+ mova m6, m5
+.w8_loop:
+ mov r3, r5
+ sar r3, 6 ; base
+ movu m0, [tlq+r3]
+ pand m1, m8, m5 ; frac
+ psubw m2, m9, m1 ; 64-frac
+ psllw m1, 8
+ pshufb m0, m3
+ por m1, m2 ; 64-frac, frac
+ pmaddubsw m0, m1 ; a*(64-frac) + b*frac
+ pcmpgtw m1, m4, m5 ; lane valid while base+i < max_base_x
+ paddw m5, m6 ; xpos += dx
+ pmulhrsw m0, m10
+ pand m0, m1
+ pandn m1, m7 ; out-of-range lanes -> top[max_base_x]
+ por m0, m1
+ packuswb m0, m0
+ movq [dstq], m0
+ dec hd
+ jz .w8_end
+ add dstq, strideq
+ add r5, dxq
+ jl .w8_loop
+ packuswb m7, m7 ; remaining rows are all top[max_base_x]
+.w8_end_loop:
+ movq [dstq], m7
+ add dstq, strideq
+ dec hd
+ jg .w8_end_loop
+.w8_end:
+ RET
+.w16: ; w=16: no upsampling; byte-wise (not word-wise) max_base clamp
+%if ARCH_X86_64
+ lea r3d, [hq+15]
+%else
+ mov r3d, hd
+ add r3d, 15
+%endif
+ movd m0, r3d
+ and r3d, 15
+ or r3d, 16 ; imin(h+15, 31)
+ test angled, 0x400
+ jnz .w16_main
+ movd m2, angled
+ shr angled, 8 ; is_sm << 1
+ pxor m1, m1
+ pshufb m0, m1 ; broadcast h+15
+ pshufb m2, m1 ; broadcast angle delta
+ movq m3, [base+z_filter_t_w16+angleq*4]
+ pcmpeqb m1, m0, [base+z_filter_wh16]
+ pand m1, m2
+ pcmpgtb m1, m3
+ pmovmskb r5d, m1
+ test r5d, r5d
+ jz .w16_main ; filter_strength == 0
+ mova m0, [tlq- 1]
+ imul r5d, 0x24924924 ; collapse 3-bit mask groups into the strength
+ mova m1, [tlq+15]
+ shr r5d, 30
+ movd m2, [tlq+30]
+ adc r5, -4 ; filter_strength-3
+ movd m3, [tlq+r3]
+ lea tlq, [rsp+16*4] ; edge copy on the stack
+ mova [tlq-16*1], m0
+ pxor m4, m4
+ mova [tlq+16*0], m1
+ pshufb m0, m4
+ movd [rsp], m2 ; unfiltered bytes 30-33, used below to filter top[32]
+ pshufb m3, m4
+ mova [tlq-16*2], m0 ; replicate first pixel to the left
+ movd [tlq+r3-15], m3 ; pad with replicated last pixel
+ call .filter_edge
+ cmp hd, 16
+ jle .w16_main
+ pshuflw m0, [rsp], q0000 ; h > 16: produce one extra filtered pixel, top[32]
+ sar r5, 1
+ movd m1, [base+z_filter_k_tail+4+r5*4]
+ lea r3d, [r5+33] ; max_base = 32 or 33
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4 ; m4 == pw_512 here, loaded by .filter_edge on x86-32
+%endif
+ packuswb m0, m0
+ movd [tlq+32], m0
+.w16_main:
+ add tlq, r3
+ movd m5, dxd
+ movd m7, [tlq] ; top[max_base_x]
+ movd m4, r3d
+ shl r3d, 6
+ pshufb m5, [base+pw_256]
+ pxor m6, m6
+ pshufb m7, m6 ; broadcast as bytes
+ mov r5d, dxd ; xpos
+ pshufb m4, m6
+ sub r5, r3 ; bias xpos: sign flips once base >= max_base_x
+ psubb m4, [base+pb_0to15] ; per-byte max_base thresholds
+ mova m6, m5
+.w16_loop:
+ mov r3, r5
+ sar r3, 6 ; base
+ movu m1, [tlq+r3+0]
+ pand m0, m8, m5 ; frac
+ movu m2, [tlq+r3+1]
+ psubw m3, m9, m0 ; 64-frac
+ psllw m0, 8
+ por m3, m0 ; 64-frac, frac
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3 ; a*(64-frac) + b*frac
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m3, m5, 6
+ packsswb m3, m3 ; base index as signed bytes for the clamp compare
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ paddw m5, m6 ; xpos += dx
+ pcmpgtb m2, m4, m3 ; byte lanes still inside the edge
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7 ; out-of-range lanes -> top[max_base_x]
+ por m0, m2
+ mova [dstq], m0
+ dec hd
+ jz .w16_end
+ add dstq, strideq
+ add r5, dxq
+ jl .w16_loop
+.w16_end_loop:
+ mova [dstq], m7 ; remaining rows are all top[max_base_x]
+ add dstq, strideq
+ dec hd
+ jg .w16_end_loop
+.w16_end:
+ RET
+.w32: ; w=32: always strength-3 filter when enabled; two 16-px output chunks
+%if ARCH_X86_64
+ lea r3d, [hq+31]
+%else
+ mov r3d, hd
+ add r3d, 31
+%endif
+ and r3d, 31
+ or r3d, 32 ; imin(h+31, 63)
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w32_main
+
+ mova m0, [tlq- 1]
+ mova m1, [tlq+15]
+ mova m2, [tlq+31]
+ mova m3, [tlq+47]
+ movd m4, [tlq+62] ; unfiltered bytes 62-65, used below to filter top[64]
+ movd m5, [tlq+r3]
+ lea tlq, [rsp+16*6] ; edge copy on the stack
+ mova [tlq-16*3], m0
+ pxor m6, m6
+ mova [tlq-16*2], m1
+ pshufb m0, m6
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*1], m2
+ mova [tlq+16*0], m3
+ movd [rsp], m4
+ pshufb m5, m6
+ mova [tlq-16*4], m0 ; replicate first pixel to the left
+ movd [tlq+r3-47], m5 ; pad with replicated last pixel
+
+ call .filter_edge ; filter the 64-byte edge in two 32-px passes
+ sub tlq, 16*2
+ call .filter_edge
+
+ cmp hd, 32
+ jle .w32_main
+ pshuflw m0, [rsp], q0000 ; h > 32: produce one extra filtered pixel, top[64]
+ movd m1, [base+z_filter_k_tail+4]
+ add r3d, 2
+ pmaddubsw m0, m1
+%if ARCH_X86_64
+ pmulhrsw m0, m10
+%else
+ pmulhrsw m0, m4 ; m4 == pw_512 here, loaded by .filter_edge on x86-32
+%endif
+ packuswb m0, m0
+ movd [tlq+64], m0
+.w32_main:
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq] ; top[max_base_x]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd ; xpos
+ pshufb m0, m6
+ pshufb m5, [base+pw_256]
+ sub r5, r3 ; bias xpos: sign flips once base >= max_base_x
+ pshufb m7, m6 ; broadcast as bytes
+ psubb m0, [base+pb_0to15] ; per-byte max_base thresholds
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0 ; threshold vector for output bytes 0-15
+ paddb m0, m1
+ mova [rsp+16*1], m0 ; threshold vector for output bytes 16-31
+ mova m6, m5
+.w32_loop:
+ mov r3, r5
+ sar r3, 6 ; base
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5 ; frac
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0 ; 64-frac
+ psllw m0, 8
+ por m3, m0 ; 64-frac, frac
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3 ; a*(64-frac) + b*frac
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4 ; base index as signed bytes for the clamp compare
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7 ; out-of-range lanes -> top[max_base_x]
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6 ; xpos += dx
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*1], m0
+ dec hd
+ jz .w32_end
+ add dstq, strideq
+ add r5, dxq
+ jl .w32_loop
+.w32_end_loop:
+ mova [dstq+16*0], m7 ; remaining rows are all top[max_base_x]
+ mova [dstq+16*1], m7
+ add dstq, strideq
+ dec hd
+ jg .w32_end_loop
+.w32_end:
+ RET
+.w64: ; w=64: four 16-px output chunks; up to 128-byte filtered edge
+%if ARCH_X86_64
+ lea r3d, [hq+63]
+%else
+ mov r3d, hd
+ add r3d, 63
+%endif
+ test angled, 0x400 ; !enable_intra_edge_filter
+ jnz .w64_main
+ mova m0, [tlq- 1]
+ mova m1, [tlq+ 15]
+ mova m2, [tlq+ 31]
+ mova m3, [tlq+ 47]
+ mova [rsp+16*3], m0 ; lower 64 bytes of the edge copy
+ pxor m5, m5
+ mova [rsp+16*4], m1
+ pshufb m0, m5
+ mova [rsp+16*5], m2
+ mova [rsp+16*6], m3
+ mova [rsp+16*2], m0 ; replicate first pixel to the left
+ mova m0, [tlq+ 63]
+ mova m1, [tlq+ 79]
+ mova m2, [tlq+ 95]
+ mova m3, [tlq+111]
+ movd m4, [tlq+r3]
+ lea tlq, [rsp+16*10] ; upper 64 bytes of the edge copy
+ mova [tlq-16*3], m0
+ xor r5d, r5d ; filter_strength = 3
+ mova [tlq-16*2], m1
+ pshufb m4, m5
+ mova [tlq-16*1], m2
+ mova [tlq+16*0], m3
+ movd [tlq+r3-111], m4 ; pad with replicated last pixel
+ cmp hd, 64
+ jl .w64_filter96 ; skip one call if the last 32 bytes aren't used
+ call .filter_edge
+.w64_filter96: ; filter the lower 96 bytes in three 32-px passes
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+ sub tlq, 16*2
+ call .filter_edge
+.w64_main:
+ add tlq, r3
+ movd m0, r3d
+ movd m7, [tlq] ; top[max_base_x]
+ shl r3d, 6
+ movd m5, dxd
+ pxor m6, m6
+ mov r5d, dxd ; xpos
+ pshufb m0, m6
+ sub r5, r3 ; bias xpos: sign flips once base >= max_base_x
+ pshufb m5, [base+pw_256]
+ pshufb m7, m6 ; broadcast as bytes
+ psubb m0, [base+pb_0to15] ; per-byte max_base thresholds
+ movddup m1, [base+pb_m16]
+ mova [rsp+16*0], m0 ; one threshold vector per 16-byte output chunk
+ paddb m0, m1
+ mova [rsp+16*1], m0
+ paddb m0, m1
+ mova [rsp+16*2], m0
+ paddb m0, m1
+ mova [rsp+16*3], m0
+ mova m6, m5
+.w64_loop:
+ mov r3, r5
+ sar r3, 6 ; base
+ movu m1, [tlq+r3+16*0+0]
+ pand m0, m8, m5 ; frac
+ movu m2, [tlq+r3+16*0+1]
+ psubw m3, m9, m0 ; 64-frac
+ psllw m0, 8
+ por m3, m0 ; 64-frac, frac
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3 ; a*(64-frac) + b*frac
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ psrlw m4, m5, 6
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ packsswb m4, m4 ; base index as signed bytes for the clamp compare
+ pcmpgtb m2, [rsp+16*0], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7 ; out-of-range lanes -> top[max_base_x]
+ por m0, m2
+ movu m1, [tlq+r3+16*1+0]
+ movu m2, [tlq+r3+16*1+1]
+ mova [dstq+16*0], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*1], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*2+0]
+ movu m2, [tlq+r3+16*2+1]
+ mova [dstq+16*1], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*2], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ movu m1, [tlq+r3+16*3+0]
+ movu m2, [tlq+r3+16*3+1]
+ mova [dstq+16*2], m0
+ punpcklbw m0, m1, m2
+ pmaddubsw m0, m3
+ punpckhbw m1, m2
+ pmaddubsw m1, m3
+ paddw m5, m6 ; xpos += dx
+ pmulhrsw m0, m10
+ pmulhrsw m1, m10
+ pcmpgtb m2, [rsp+16*3], m4
+ packuswb m0, m1
+ pand m0, m2
+ pandn m2, m7
+ por m0, m2
+ mova [dstq+16*3], m0
+ dec hd
+ jz .w64_end
+ add dstq, strideq
+ add r5, dxq
+ jl .w64_loop
+.w64_end_loop:
+ mova [dstq+16*0], m7 ; remaining rows are all top[max_base_x]
+ mova [dstq+16*1], m7
+ mova [dstq+16*2], m7
+ mova [dstq+16*3], m7
+ add dstq, strideq
+ dec hd
+ jg .w64_end_loop
+.w64_end:
+ RET
+ALIGN function_align
+.filter_edge: ; 32 pixels/iteration
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*0] ; in: tlq points into edge copy, r5d = filter_strength-3 (0 => extra 5-tap pass)
+ movu m2, [tlq-17] ; pixels at offset -1 relative to the 32 outputs
+ mova m1, [tlq-16] ; center pixels, low half
+ movu m3, [tlq- 1]
+ mova m4, [tlq+ 0] ; center pixels, high half
+ punpcklbw m0, m2, m1
+ pmaddubsw m0, m7 ; taps 0/1 applied to (prev, center) pairs
+ punpckhbw m2, m1
+ pmaddubsw m2, m7
+ punpcklbw m1, m3, m4
+ pmaddubsw m1, m7
+ punpckhbw m3, m4
+ pmaddubsw m3, m7
+ movddup m7, [base+z_filter_k+8*2+r5*8+24*1]
+ movu m5, [tlq-15] ; pixels at offsets +1/+2
+ movu m6, [tlq-14]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7 ; taps 2/3 applied to (next, next2) pairs
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ movu m5, [tlq+ 1]
+ movu m6, [tlq+ 2]
+ punpcklbw m4, m5, m6
+ pmaddubsw m4, m7
+ punpckhbw m5, m6
+ pmaddubsw m5, m7
+ paddw m1, m4
+ paddw m3, m5
+ test r5d, r5d
+ jnz .filter_end ; 3-tap
+ movddup m7, [base+z_filter_k+8*8] ; strength 3: add the 5th tap (offset +3)
+ movu m5, [tlq-13]
+ movu m6, [tlq+ 3]
+ punpcklbw m4, m5, m5
+ pmaddubsw m4, m7
+ punpckhbw m5, m5
+ pmaddubsw m5, m7
+ paddw m0, m4
+ paddw m2, m5
+ punpcklbw m5, m6, m6
+ pmaddubsw m5, m7
+ punpckhbw m6, m6
+ pmaddubsw m6, m7
+ paddw m1, m5
+ paddw m3, m6
+.filter_end:
+%if ARCH_X86_64
+ REPX {pmulhrsw x, m10}, m0, m2, m1, m3 ; (sum + 32) >> 6 via pw_512
+%else
+ mova m4, m10 ; note: m4 keeps pw_512 after return; reused by w16/w32 tails
+ REPX {pmulhrsw x, m4 }, m0, m2, m1, m3
+%endif
+ packuswb m0, m2
+ packuswb m1, m3
+ mova [tlq+16*0], m0 ; filtered result written back in place
+ mova [tlq+16*1], m1
+ ret
+
;---------------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
; const uint8_t *idx, const int w, const int h);