1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
11
12declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>)
13declare {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32>, <2 x i32>)
14declare {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32>, <3 x i32>)
15declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>)
16declare {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32>, <6 x i32>)
17declare {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>)
18declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>)
19
20declare {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>)
21declare {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>)
22declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
23declare {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>)
24declare {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>)
25
26declare {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24>, <4 x i24>)
27declare {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1>, <4 x i1>)
28declare {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128>, <2 x i128>)
29
30define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind {
31; CHECK-LABEL: umulo_v1i32:
32; CHECK:       # %bb.0:
33; CHECK-NEXT:    movq %rdx, %rcx
34; CHECK-NEXT:    movl %edi, %eax
35; CHECK-NEXT:    xorl %edi, %edi
36; CHECK-NEXT:    mull %esi
37; CHECK-NEXT:    seto %dil
38; CHECK-NEXT:    negl %edi
39; CHECK-NEXT:    movl %eax, (%rcx)
40; CHECK-NEXT:    movl %edi, %eax
41; CHECK-NEXT:    retq
42  %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
43  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
44  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
45  %res = sext <1 x i1> %obit to <1 x i32>
46  store <1 x i32> %val, ptr %p2
47  ret <1 x i32> %res
48}
49
50define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind {
51; SSE2-LABEL: umulo_v2i32:
52; SSE2:       # %bb.0:
53; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
54; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
55; SSE2-NEXT:    pmuludq %xmm1, %xmm2
56; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
57; SSE2-NEXT:    pxor %xmm1, %xmm1
58; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
59; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
60; SSE2-NEXT:    pxor %xmm1, %xmm0
61; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
62; SSE2-NEXT:    movq %xmm1, (%rdi)
63; SSE2-NEXT:    retq
64;
65; SSSE3-LABEL: umulo_v2i32:
66; SSSE3:       # %bb.0:
67; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
68; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
69; SSSE3-NEXT:    pmuludq %xmm1, %xmm2
70; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
71; SSSE3-NEXT:    pxor %xmm1, %xmm1
72; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
73; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
74; SSSE3-NEXT:    pxor %xmm1, %xmm0
75; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
76; SSSE3-NEXT:    movq %xmm1, (%rdi)
77; SSSE3-NEXT:    retq
78;
79; SSE41-LABEL: umulo_v2i32:
80; SSE41:       # %bb.0:
81; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
82; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
83; SSE41-NEXT:    pmuludq %xmm1, %xmm2
84; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3]
85; SSE41-NEXT:    pxor %xmm1, %xmm1
86; SSE41-NEXT:    pcmpeqd %xmm0, %xmm1
87; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
88; SSE41-NEXT:    pxor %xmm1, %xmm0
89; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
90; SSE41-NEXT:    movq %xmm1, (%rdi)
91; SSE41-NEXT:    retq
92;
93; AVX-LABEL: umulo_v2i32:
94; AVX:       # %bb.0:
95; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
96; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
97; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm1
98; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
99; AVX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
100; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
101; AVX-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
102; AVX-NEXT:    vpxor %xmm2, %xmm0, %xmm0
103; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
104; AVX-NEXT:    vmovq %xmm1, (%rdi)
105; AVX-NEXT:    retq
106;
107; AVX512-LABEL: umulo_v2i32:
108; AVX512:       # %bb.0:
109; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
110; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
111; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
112; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
113; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
114; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
115; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
116; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
117; AVX512-NEXT:    vmovq %xmm1, (%rdi)
118; AVX512-NEXT:    retq
119  %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
120  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
121  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
122  %res = sext <2 x i1> %obit to <2 x i32>
123  store <2 x i32> %val, ptr %p2
124  ret <2 x i32> %res
125}
126
127define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind {
128; SSE2-LABEL: umulo_v3i32:
129; SSE2:       # %bb.0:
130; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
131; SSE2-NEXT:    pmuludq %xmm1, %xmm0
132; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
133; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
134; SSE2-NEXT:    pmuludq %xmm2, %xmm4
135; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
136; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
137; SSE2-NEXT:    pxor %xmm2, %xmm2
138; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
139; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
140; SSE2-NEXT:    pxor %xmm2, %xmm1
141; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
142; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
143; SSE2-NEXT:    movd %xmm2, 8(%rdi)
144; SSE2-NEXT:    movq %xmm0, (%rdi)
145; SSE2-NEXT:    movdqa %xmm1, %xmm0
146; SSE2-NEXT:    retq
147;
148; SSSE3-LABEL: umulo_v3i32:
149; SSSE3:       # %bb.0:
150; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
151; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
152; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
153; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
154; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
155; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
156; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
157; SSSE3-NEXT:    pxor %xmm2, %xmm2
158; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
159; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
160; SSSE3-NEXT:    pxor %xmm2, %xmm1
161; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
162; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
163; SSSE3-NEXT:    movd %xmm2, 8(%rdi)
164; SSSE3-NEXT:    movq %xmm0, (%rdi)
165; SSSE3-NEXT:    movdqa %xmm1, %xmm0
166; SSSE3-NEXT:    retq
167;
168; SSE41-LABEL: umulo_v3i32:
169; SSE41:       # %bb.0:
170; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
171; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
172; SSE41-NEXT:    pmuludq %xmm2, %xmm3
173; SSE41-NEXT:    movdqa %xmm0, %xmm2
174; SSE41-NEXT:    pmuludq %xmm1, %xmm2
175; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
176; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
177; SSE41-NEXT:    pxor %xmm3, %xmm3
178; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
179; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
180; SSE41-NEXT:    pxor %xmm3, %xmm2
181; SSE41-NEXT:    pmulld %xmm1, %xmm0
182; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
183; SSE41-NEXT:    movq %xmm0, (%rdi)
184; SSE41-NEXT:    movdqa %xmm2, %xmm0
185; SSE41-NEXT:    retq
186;
187; AVX1-LABEL: umulo_v3i32:
188; AVX1:       # %bb.0:
189; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
190; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
191; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
192; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
193; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
194; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
195; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
196; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
197; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
198; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
199; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
200; AVX1-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
201; AVX1-NEXT:    vmovq %xmm0, (%rdi)
202; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
203; AVX1-NEXT:    retq
204;
205; AVX2-LABEL: umulo_v3i32:
206; AVX2:       # %bb.0:
207; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
208; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
209; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
210; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
211; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
212; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
213; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
214; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
215; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
216; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
217; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
218; AVX2-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
219; AVX2-NEXT:    vmovq %xmm0, (%rdi)
220; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
221; AVX2-NEXT:    retq
222;
223; AVX512-LABEL: umulo_v3i32:
224; AVX512:       # %bb.0:
225; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
226; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
227; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
228; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
229; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
230; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
231; AVX512-NEXT:    vptestmd %xmm4, %xmm4, %k1
232; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
233; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
234; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
235; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
236; AVX512-NEXT:    vmovq %xmm1, (%rdi)
237; AVX512-NEXT:    retq
238  %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
239  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
240  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
241  %res = sext <3 x i1> %obit to <3 x i32>
242  store <3 x i32> %val, ptr %p2
243  ret <3 x i32> %res
244}
245
246define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind {
247; SSE2-LABEL: umulo_v4i32:
248; SSE2:       # %bb.0:
249; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
250; SSE2-NEXT:    pmuludq %xmm1, %xmm0
251; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
252; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
253; SSE2-NEXT:    pmuludq %xmm2, %xmm4
254; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
255; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
256; SSE2-NEXT:    pxor %xmm2, %xmm2
257; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
258; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
259; SSE2-NEXT:    pxor %xmm2, %xmm1
260; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
261; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
262; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
263; SSE2-NEXT:    movdqa %xmm0, (%rdi)
264; SSE2-NEXT:    movdqa %xmm1, %xmm0
265; SSE2-NEXT:    retq
266;
267; SSSE3-LABEL: umulo_v4i32:
268; SSSE3:       # %bb.0:
269; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
270; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
271; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
272; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
273; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
274; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
275; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
276; SSSE3-NEXT:    pxor %xmm2, %xmm2
277; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
278; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
279; SSSE3-NEXT:    pxor %xmm2, %xmm1
280; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
281; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
282; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
283; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
284; SSSE3-NEXT:    movdqa %xmm1, %xmm0
285; SSSE3-NEXT:    retq
286;
287; SSE41-LABEL: umulo_v4i32:
288; SSE41:       # %bb.0:
289; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
290; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
291; SSE41-NEXT:    pmuludq %xmm2, %xmm3
292; SSE41-NEXT:    movdqa %xmm0, %xmm2
293; SSE41-NEXT:    pmuludq %xmm1, %xmm2
294; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
295; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
296; SSE41-NEXT:    pxor %xmm3, %xmm3
297; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
298; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
299; SSE41-NEXT:    pxor %xmm3, %xmm2
300; SSE41-NEXT:    pmulld %xmm1, %xmm0
301; SSE41-NEXT:    movdqa %xmm0, (%rdi)
302; SSE41-NEXT:    movdqa %xmm2, %xmm0
303; SSE41-NEXT:    retq
304;
305; AVX1-LABEL: umulo_v4i32:
306; AVX1:       # %bb.0:
307; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
308; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
309; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
310; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
311; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
312; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
313; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
314; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
315; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
316; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
317; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
318; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
319; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
320; AVX1-NEXT:    retq
321;
322; AVX2-LABEL: umulo_v4i32:
323; AVX2:       # %bb.0:
324; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
325; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
326; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
327; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
328; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
329; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
330; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
331; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
332; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
333; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
334; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
335; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
336; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
337; AVX2-NEXT:    retq
338;
339; AVX512-LABEL: umulo_v4i32:
340; AVX512:       # %bb.0:
341; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
342; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
343; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
344; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
345; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
346; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
347; AVX512-NEXT:    vptestmd %xmm4, %xmm4, %k1
348; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
349; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
350; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
351; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
352; AVX512-NEXT:    retq
353  %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
354  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
355  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
356  %res = sext <4 x i1> %obit to <4 x i32>
357  store <4 x i32> %val, ptr %p2
358  ret <4 x i32> %res
359}
360
361define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
362; SSE2-LABEL: umulo_v6i32:
363; SSE2:       # %bb.0:
364; SSE2-NEXT:    movq %rdi, %rax
365; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
366; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
367; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
368; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
369; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
370; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
371; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
372; SSE2-NEXT:    movd %r8d, %xmm0
373; SSE2-NEXT:    movd %ecx, %xmm1
374; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
375; SSE2-NEXT:    movd %edx, %xmm0
376; SSE2-NEXT:    movd %esi, %xmm3
377; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
378; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
379; SSE2-NEXT:    movd %r9d, %xmm1
380; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
381; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
382; SSE2-NEXT:    pmuludq %xmm1, %xmm0
383; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
384; SSE2-NEXT:    pmuludq %xmm2, %xmm3
385; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
386; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
387; SSE2-NEXT:    pmuludq %xmm4, %xmm2
388; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
389; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
390; SSE2-NEXT:    pxor %xmm4, %xmm4
391; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
392; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
393; SSE2-NEXT:    pxor %xmm5, %xmm1
394; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
395; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
396; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
397; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
398; SSE2-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
399; SSE2-NEXT:    pmuludq %xmm2, %xmm6
400; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
401; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
402; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
403; SSE2-NEXT:    pcmpeqd %xmm4, %xmm7
404; SSE2-NEXT:    pxor %xmm5, %xmm7
405; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
406; SSE2-NEXT:    movq %xmm0, 16(%rcx)
407; SSE2-NEXT:    movdqa %xmm3, (%rcx)
408; SSE2-NEXT:    movq %xmm7, 16(%rdi)
409; SSE2-NEXT:    movdqa %xmm1, (%rdi)
410; SSE2-NEXT:    retq
411;
412; SSSE3-LABEL: umulo_v6i32:
413; SSSE3:       # %bb.0:
414; SSSE3-NEXT:    movq %rdi, %rax
415; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
416; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
417; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
418; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
419; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
420; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
421; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
422; SSSE3-NEXT:    movd %r8d, %xmm0
423; SSSE3-NEXT:    movd %ecx, %xmm1
424; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
425; SSSE3-NEXT:    movd %edx, %xmm0
426; SSSE3-NEXT:    movd %esi, %xmm3
427; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
428; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
429; SSSE3-NEXT:    movd %r9d, %xmm1
430; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
431; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
432; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
433; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
434; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
435; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
436; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
437; SSSE3-NEXT:    pmuludq %xmm4, %xmm2
438; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
439; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
440; SSSE3-NEXT:    pxor %xmm4, %xmm4
441; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm1
442; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
443; SSSE3-NEXT:    pxor %xmm5, %xmm1
444; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
445; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
446; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
447; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
448; SSSE3-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
449; SSSE3-NEXT:    pmuludq %xmm2, %xmm6
450; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
451; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
452; SSSE3-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
453; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm7
454; SSSE3-NEXT:    pxor %xmm5, %xmm7
455; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
456; SSSE3-NEXT:    movq %xmm0, 16(%rcx)
457; SSSE3-NEXT:    movdqa %xmm3, (%rcx)
458; SSSE3-NEXT:    movq %xmm7, 16(%rdi)
459; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
460; SSSE3-NEXT:    retq
461;
462; SSE41-LABEL: umulo_v6i32:
463; SSE41:       # %bb.0:
464; SSE41-NEXT:    movq %rdi, %rax
465; SSE41-NEXT:    movd %esi, %xmm2
466; SSE41-NEXT:    pinsrd $1, %edx, %xmm2
467; SSE41-NEXT:    pinsrd $2, %ecx, %xmm2
468; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
469; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
470; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm1
471; SSE41-NEXT:    movdqa %xmm1, %xmm0
472; SSE41-NEXT:    pmuludq %xmm2, %xmm1
473; SSE41-NEXT:    pinsrd $3, %r8d, %xmm2
474; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
475; SSE41-NEXT:    movd %r9d, %xmm4
476; SSE41-NEXT:    movdqa %xmm4, %xmm5
477; SSE41-NEXT:    pmuludq %xmm3, %xmm4
478; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
479; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm5
480; SSE41-NEXT:    pmulld %xmm3, %xmm5
481; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm0
482; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
483; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
484; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
485; SSE41-NEXT:    pmuludq %xmm3, %xmm6
486; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
487; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
488; SSE41-NEXT:    pxor %xmm8, %xmm8
489; SSE41-NEXT:    pcmpeqd %xmm8, %xmm1
490; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
491; SSE41-NEXT:    pxor %xmm6, %xmm1
492; SSE41-NEXT:    movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
493; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
494; SSE41-NEXT:    pmuludq %xmm7, %xmm3
495; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
496; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
497; SSE41-NEXT:    pcmpeqd %xmm8, %xmm4
498; SSE41-NEXT:    pxor %xmm6, %xmm4
499; SSE41-NEXT:    pmulld %xmm2, %xmm0
500; SSE41-NEXT:    movq %xmm5, 16(%rcx)
501; SSE41-NEXT:    movdqa %xmm0, (%rcx)
502; SSE41-NEXT:    movq %xmm4, 16(%rdi)
503; SSE41-NEXT:    movdqa %xmm1, (%rdi)
504; SSE41-NEXT:    retq
505;
506; AVX1-LABEL: umulo_v6i32:
507; AVX1:       # %bb.0:
508; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
509; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
510; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
511; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
512; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
513; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm5
514; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
515; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
516; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
517; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm8, %xmm2
518; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
519; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
520; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
521; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
522; AVX1-NEXT:    vpmuludq %xmm7, %xmm5, %xmm5
523; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm7
524; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
525; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
526; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm8, %xmm5
527; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
528; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
529; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
530; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
531; AVX1-NEXT:    vmovq %xmm1, 16(%rdi)
532; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
533; AVX1-NEXT:    vmovaps %ymm2, %ymm0
534; AVX1-NEXT:    retq
535;
536; AVX2-LABEL: umulo_v6i32:
537; AVX2:       # %bb.0:
538; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
539; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
540; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
541; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm3
542; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
543; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
544; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
545; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
546; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
547; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
548; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
549; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
550; AVX2-NEXT:    vmovq %xmm1, 16(%rdi)
551; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
552; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
553; AVX2-NEXT:    retq
554;
555; AVX512-LABEL: umulo_v6i32:
556; AVX512:       # %bb.0:
557; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
558; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
559; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
560; AVX512-NEXT:    vpmuludq %ymm3, %ymm4, %ymm3
561; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
562; AVX512-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
563; AVX512-NEXT:    vptestmd %ymm4, %ymm4, %k1
564; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
565; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
566; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
567; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
568; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
569; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
570; AVX512-NEXT:    retq
571  %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
572  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
573  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
574  %res = sext <6 x i1> %obit to <6 x i32>
575  store <6 x i32> %val, ptr %p2
576  ret <6 x i32> %res
577}
578
579define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
580; SSE2-LABEL: umulo_v8i32:
581; SSE2:       # %bb.0:
582; SSE2-NEXT:    movdqa %xmm0, %xmm4
583; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
584; SSE2-NEXT:    pmuludq %xmm2, %xmm4
585; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3]
586; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
587; SSE2-NEXT:    pmuludq %xmm5, %xmm6
588; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
589; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
590; SSE2-NEXT:    pxor %xmm8, %xmm8
591; SSE2-NEXT:    pcmpeqd %xmm8, %xmm0
592; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
593; SSE2-NEXT:    pxor %xmm7, %xmm0
594; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
595; SSE2-NEXT:    pmuludq %xmm3, %xmm1
596; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
597; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
598; SSE2-NEXT:    pmuludq %xmm5, %xmm3
599; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
600; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
601; SSE2-NEXT:    pcmpeqd %xmm8, %xmm2
602; SSE2-NEXT:    pxor %xmm7, %xmm2
603; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
604; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
605; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
606; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
607; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
608; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
609; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
610; SSE2-NEXT:    movdqa %xmm4, (%rdi)
611; SSE2-NEXT:    movdqa %xmm2, %xmm1
612; SSE2-NEXT:    retq
613;
614; SSSE3-LABEL: umulo_v8i32:
615; SSSE3:       # %bb.0:
616; SSSE3-NEXT:    movdqa %xmm0, %xmm4
617; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
618; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
619; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3]
620; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
621; SSSE3-NEXT:    pmuludq %xmm5, %xmm6
622; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
623; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
624; SSSE3-NEXT:    pxor %xmm8, %xmm8
625; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm0
626; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm7
627; SSSE3-NEXT:    pxor %xmm7, %xmm0
628; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
629; SSSE3-NEXT:    pmuludq %xmm3, %xmm1
630; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
631; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
632; SSSE3-NEXT:    pmuludq %xmm5, %xmm3
633; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
634; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
635; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm2
636; SSSE3-NEXT:    pxor %xmm7, %xmm2
637; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
638; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
639; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
640; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
641; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
642; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
643; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
644; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
645; SSSE3-NEXT:    movdqa %xmm2, %xmm1
646; SSSE3-NEXT:    retq
647;
648; SSE41-LABEL: umulo_v8i32:
649; SSE41:       # %bb.0:
650; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
651; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
652; SSE41-NEXT:    pmuludq %xmm4, %xmm5
653; SSE41-NEXT:    movdqa %xmm0, %xmm4
654; SSE41-NEXT:    pmuludq %xmm2, %xmm4
655; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
656; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
657; SSE41-NEXT:    pxor %xmm8, %xmm8
658; SSE41-NEXT:    pcmpeqd %xmm8, %xmm4
659; SSE41-NEXT:    pcmpeqd %xmm7, %xmm7
660; SSE41-NEXT:    pxor %xmm7, %xmm4
661; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
662; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
663; SSE41-NEXT:    pmuludq %xmm5, %xmm6
664; SSE41-NEXT:    movdqa %xmm1, %xmm5
665; SSE41-NEXT:    pmuludq %xmm3, %xmm5
666; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
667; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
668; SSE41-NEXT:    pcmpeqd %xmm8, %xmm5
669; SSE41-NEXT:    pxor %xmm7, %xmm5
670; SSE41-NEXT:    pmulld %xmm2, %xmm0
671; SSE41-NEXT:    pmulld %xmm3, %xmm1
672; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
673; SSE41-NEXT:    movdqa %xmm0, (%rdi)
674; SSE41-NEXT:    movdqa %xmm4, %xmm0
675; SSE41-NEXT:    movdqa %xmm5, %xmm1
676; SSE41-NEXT:    retq
677;
678; AVX1-LABEL: umulo_v8i32:
679; AVX1:       # %bb.0:
680; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
681; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
682; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
683; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
684; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
685; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm5
686; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
687; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
688; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
689; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm8, %xmm2
690; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
691; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
692; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
693; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
694; AVX1-NEXT:    vpmuludq %xmm7, %xmm5, %xmm5
695; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm7
696; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
697; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
698; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm8, %xmm5
699; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
700; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
701; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
702; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
703; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdi)
704; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
705; AVX1-NEXT:    vmovaps %ymm2, %ymm0
706; AVX1-NEXT:    retq
707;
708; AVX2-LABEL: umulo_v8i32:
709; AVX2:       # %bb.0:
710; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
711; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
712; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
713; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm3
714; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
715; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
716; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
717; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
718; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
719; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
720; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
721; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
722; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
723; AVX2-NEXT:    retq
724;
725; AVX512-LABEL: umulo_v8i32:
726; AVX512:       # %bb.0:
727; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
728; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
729; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
730; AVX512-NEXT:    vpmuludq %ymm3, %ymm4, %ymm3
731; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
732; AVX512-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
733; AVX512-NEXT:    vptestmd %ymm4, %ymm4, %k1
734; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
735; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
736; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
737; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
738; AVX512-NEXT:    retq
739  %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
740  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
741  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
742  %res = sext <8 x i1> %obit to <8 x i32>
743  store <8 x i32> %val, ptr %p2
744  ret <8 x i32> %res
745}
746
747define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind {
748; SSE2-LABEL: umulo_v16i32:
749; SSE2:       # %bb.0:
750; SSE2-NEXT:    movdqa %xmm0, %xmm8
751; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
752; SSE2-NEXT:    pmuludq %xmm4, %xmm8
753; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3]
754; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
755; SSE2-NEXT:    pmuludq %xmm10, %xmm9
756; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3]
757; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
758; SSE2-NEXT:    pxor %xmm10, %xmm10
759; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
760; SSE2-NEXT:    pcmpeqd %xmm11, %xmm11
761; SSE2-NEXT:    pxor %xmm11, %xmm0
762; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
763; SSE2-NEXT:    pmuludq %xmm5, %xmm1
764; SSE2-NEXT:    pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3]
765; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
766; SSE2-NEXT:    pmuludq %xmm13, %xmm12
767; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3]
768; SSE2-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
769; SSE2-NEXT:    pcmpeqd %xmm10, %xmm15
770; SSE2-NEXT:    pxor %xmm11, %xmm15
771; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
772; SSE2-NEXT:    pmuludq %xmm6, %xmm2
773; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3]
774; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
775; SSE2-NEXT:    pmuludq %xmm14, %xmm13
776; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
777; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
778; SSE2-NEXT:    pcmpeqd %xmm10, %xmm5
779; SSE2-NEXT:    pxor %xmm11, %xmm5
780; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
781; SSE2-NEXT:    pmuludq %xmm7, %xmm3
782; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
783; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
784; SSE2-NEXT:    pmuludq %xmm14, %xmm7
785; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3]
786; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
787; SSE2-NEXT:    pcmpeqd %xmm10, %xmm6
788; SSE2-NEXT:    pxor %xmm11, %xmm6
789; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
790; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
791; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
792; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
793; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
794; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
795; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
796; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
797; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
798; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
799; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
800; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
801; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
802; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
803; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
804; SSE2-NEXT:    movdqa %xmm8, (%rdi)
805; SSE2-NEXT:    movdqa %xmm15, %xmm1
806; SSE2-NEXT:    movdqa %xmm5, %xmm2
807; SSE2-NEXT:    movdqa %xmm6, %xmm3
808; SSE2-NEXT:    retq
809;
810; SSSE3-LABEL: umulo_v16i32:
811; SSSE3:       # %bb.0:
812; SSSE3-NEXT:    movdqa %xmm0, %xmm8
813; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
814; SSSE3-NEXT:    pmuludq %xmm4, %xmm8
815; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3]
816; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
817; SSSE3-NEXT:    pmuludq %xmm10, %xmm9
818; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3]
819; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
820; SSSE3-NEXT:    pxor %xmm10, %xmm10
821; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm0
822; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm11
823; SSSE3-NEXT:    pxor %xmm11, %xmm0
824; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
825; SSSE3-NEXT:    pmuludq %xmm5, %xmm1
826; SSSE3-NEXT:    pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3]
827; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
828; SSSE3-NEXT:    pmuludq %xmm13, %xmm12
829; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3]
830; SSSE3-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
831; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm15
832; SSSE3-NEXT:    pxor %xmm11, %xmm15
833; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
834; SSSE3-NEXT:    pmuludq %xmm6, %xmm2
835; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3]
836; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
837; SSSE3-NEXT:    pmuludq %xmm14, %xmm13
838; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
839; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
840; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm5
841; SSSE3-NEXT:    pxor %xmm11, %xmm5
842; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
843; SSSE3-NEXT:    pmuludq %xmm7, %xmm3
844; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
845; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
846; SSSE3-NEXT:    pmuludq %xmm14, %xmm7
847; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3]
848; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
849; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm6
850; SSSE3-NEXT:    pxor %xmm11, %xmm6
851; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
852; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
853; SSSE3-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
854; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
855; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
856; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
857; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
858; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
859; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
860; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
861; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
862; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
863; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
864; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
865; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
866; SSSE3-NEXT:    movdqa %xmm8, (%rdi)
867; SSSE3-NEXT:    movdqa %xmm15, %xmm1
868; SSSE3-NEXT:    movdqa %xmm5, %xmm2
869; SSSE3-NEXT:    movdqa %xmm6, %xmm3
870; SSSE3-NEXT:    retq
871;
872; SSE41-LABEL: umulo_v16i32:
873; SSE41:       # %bb.0:
874; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
875; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
876; SSE41-NEXT:    pmuludq %xmm8, %xmm9
877; SSE41-NEXT:    movdqa %xmm0, %xmm8
878; SSE41-NEXT:    pmuludq %xmm4, %xmm8
879; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
880; SSE41-NEXT:    pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7]
881; SSE41-NEXT:    pxor %xmm12, %xmm12
882; SSE41-NEXT:    pcmpeqd %xmm12, %xmm8
883; SSE41-NEXT:    pcmpeqd %xmm13, %xmm13
884; SSE41-NEXT:    pxor %xmm13, %xmm8
885; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
886; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
887; SSE41-NEXT:    pmuludq %xmm9, %xmm10
888; SSE41-NEXT:    movdqa %xmm1, %xmm9
889; SSE41-NEXT:    pmuludq %xmm5, %xmm9
890; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
891; SSE41-NEXT:    pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5],xmm10[6,7]
892; SSE41-NEXT:    pcmpeqd %xmm12, %xmm9
893; SSE41-NEXT:    pxor %xmm13, %xmm9
894; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
895; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
896; SSE41-NEXT:    pmuludq %xmm10, %xmm11
897; SSE41-NEXT:    movdqa %xmm2, %xmm10
898; SSE41-NEXT:    pmuludq %xmm6, %xmm10
899; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
900; SSE41-NEXT:    pblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7]
901; SSE41-NEXT:    pcmpeqd %xmm12, %xmm10
902; SSE41-NEXT:    pxor %xmm13, %xmm10
903; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm7[1,1,3,3]
904; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
905; SSE41-NEXT:    pmuludq %xmm11, %xmm14
906; SSE41-NEXT:    movdqa %xmm3, %xmm11
907; SSE41-NEXT:    pmuludq %xmm7, %xmm11
908; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
909; SSE41-NEXT:    pblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3],xmm11[4,5],xmm14[6,7]
910; SSE41-NEXT:    pcmpeqd %xmm12, %xmm11
911; SSE41-NEXT:    pxor %xmm13, %xmm11
912; SSE41-NEXT:    pmulld %xmm4, %xmm0
913; SSE41-NEXT:    pmulld %xmm5, %xmm1
914; SSE41-NEXT:    pmulld %xmm6, %xmm2
915; SSE41-NEXT:    pmulld %xmm7, %xmm3
916; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
917; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
918; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
919; SSE41-NEXT:    movdqa %xmm0, (%rdi)
920; SSE41-NEXT:    movdqa %xmm8, %xmm0
921; SSE41-NEXT:    movdqa %xmm9, %xmm1
922; SSE41-NEXT:    movdqa %xmm10, %xmm2
923; SSE41-NEXT:    movdqa %xmm11, %xmm3
924; SSE41-NEXT:    retq
925;
926; AVX1-LABEL: umulo_v16i32:
927; AVX1:       # %bb.0:
928; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm10
929; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm10[1,1,3,3]
930; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm12
931; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm12[1,1,3,3]
932; AVX1-NEXT:    vpmuludq %xmm6, %xmm7, %xmm6
933; AVX1-NEXT:    vpmuludq %xmm10, %xmm12, %xmm7
934; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
935; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
936; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
937; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm8, %xmm7
938; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm9, %xmm9
939; AVX1-NEXT:    vpxor %xmm7, %xmm9, %xmm7
940; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
941; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
942; AVX1-NEXT:    vpmuludq %xmm6, %xmm4, %xmm4
943; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm6
944; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
945; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5],xmm4[6,7]
946; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm8, %xmm4
947; AVX1-NEXT:    vpxor %xmm4, %xmm9, %xmm4
948; AVX1-NEXT:    vpackssdw %xmm7, %xmm4, %xmm11
949; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
950; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
951; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
952; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
953; AVX1-NEXT:    vpmuludq %xmm7, %xmm5, %xmm5
954; AVX1-NEXT:    vpmuludq %xmm6, %xmm4, %xmm7
955; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
956; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
957; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm8, %xmm5
958; AVX1-NEXT:    vpxor %xmm5, %xmm9, %xmm13
959; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
960; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
961; AVX1-NEXT:    vpmuludq %xmm7, %xmm5, %xmm5
962; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm7
963; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
964; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
965; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm8, %xmm5
966; AVX1-NEXT:    vpxor %xmm5, %xmm9, %xmm5
967; AVX1-NEXT:    vpackssdw %xmm13, %xmm5, %xmm5
968; AVX1-NEXT:    vpacksswb %xmm11, %xmm5, %xmm5
969; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm2
970; AVX1-NEXT:    vpmulld %xmm6, %xmm4, %xmm4
971; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm3
972; AVX1-NEXT:    vpmulld %xmm10, %xmm12, %xmm6
973; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm0
974; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
975; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
976; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
977; AVX1-NEXT:    vpacksswb %xmm11, %xmm11, %xmm1
978; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm5
979; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
980; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
981; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
982; AVX1-NEXT:    vmovdqa %xmm6, 48(%rdi)
983; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
984; AVX1-NEXT:    vmovdqa %xmm4, 16(%rdi)
985; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
986; AVX1-NEXT:    retq
987;
988; AVX2-LABEL: umulo_v16i32:
989; AVX2:       # %bb.0:
990; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
991; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
992; AVX2-NEXT:    vpmuludq %ymm4, %ymm5, %ymm4
993; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm5
994; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,1,3,3,5,5,7,7]
995; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7]
996; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
997; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm4, %ymm4
998; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
999; AVX2-NEXT:    vpxor %ymm6, %ymm4, %ymm4
1000; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm7
1001; AVX2-NEXT:    vpackssdw %xmm7, %xmm4, %xmm4
1002; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[1,1,3,3,5,5,7,7]
1003; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[1,1,3,3,5,5,7,7]
1004; AVX2-NEXT:    vpmuludq %ymm7, %ymm8, %ymm7
1005; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm8
1006; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7]
1007; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
1008; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
1009; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
1010; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
1011; AVX2-NEXT:    vpackssdw %xmm6, %xmm5, %xmm5
1012; AVX2-NEXT:    vpacksswb %xmm5, %xmm5, %xmm5
1013; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm2
1014; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm3
1015; AVX2-NEXT:    vpmovsxbd %xmm5, %ymm0
1016; AVX2-NEXT:    vpacksswb %xmm4, %xmm4, %xmm1
1017; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
1018; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
1019; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
1020; AVX2-NEXT:    retq
1021;
1022; AVX512-LABEL: umulo_v16i32:
1023; AVX512:       # %bb.0:
1024; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2
1025; AVX512-NEXT:    vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
1026; AVX512-NEXT:    vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
1027; AVX512-NEXT:    vpmuludq %zmm3, %zmm4, %zmm3
1028; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
1029; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
1030; AVX512-NEXT:    vptestmd %zmm4, %zmm4, %k1
1031; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm1
1032; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1033; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
1034; AVX512-NEXT:    retq
1035  %t = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
1036  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
1037  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
1038  %res = sext <16 x i1> %obit to <16 x i32>
1039  store <16 x i32> %val, ptr %p2
1040  ret <16 x i32> %res
1041}
1042
1043define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind {
1044; SSE2-LABEL: umulo_v16i8:
1045; SSE2:       # %bb.0:
1046; SSE2-NEXT:    pxor %xmm2, %xmm2
1047; SSE2-NEXT:    movdqa %xmm1, %xmm3
1048; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1049; SSE2-NEXT:    movdqa %xmm0, %xmm5
1050; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
1051; SSE2-NEXT:    pmullw %xmm3, %xmm5
1052; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1053; SSE2-NEXT:    movdqa %xmm5, %xmm3
1054; SSE2-NEXT:    pand %xmm4, %xmm3
1055; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1056; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1057; SSE2-NEXT:    pmullw %xmm1, %xmm0
1058; SSE2-NEXT:    pand %xmm0, %xmm4
1059; SSE2-NEXT:    packuswb %xmm3, %xmm4
1060; SSE2-NEXT:    psrlw $8, %xmm5
1061; SSE2-NEXT:    psrlw $8, %xmm0
1062; SSE2-NEXT:    packuswb %xmm5, %xmm0
1063; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
1064; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
1065; SSE2-NEXT:    pxor %xmm2, %xmm3
1066; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1067; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1068; SSE2-NEXT:    psrad $24, %xmm0
1069; SSE2-NEXT:    movdqa %xmm3, %xmm1
1070; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1071; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1072; SSE2-NEXT:    pslld $31, %xmm1
1073; SSE2-NEXT:    psrad $31, %xmm1
1074; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1075; SSE2-NEXT:    movdqa %xmm3, %xmm2
1076; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1077; SSE2-NEXT:    pslld $31, %xmm2
1078; SSE2-NEXT:    psrad $31, %xmm2
1079; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1080; SSE2-NEXT:    pslld $31, %xmm3
1081; SSE2-NEXT:    psrad $31, %xmm3
1082; SSE2-NEXT:    movdqa %xmm4, (%rdi)
1083; SSE2-NEXT:    retq
1084;
1085; SSSE3-LABEL: umulo_v16i8:
1086; SSSE3:       # %bb.0:
1087; SSSE3-NEXT:    pxor %xmm2, %xmm2
1088; SSSE3-NEXT:    movdqa %xmm1, %xmm3
1089; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1090; SSSE3-NEXT:    movdqa %xmm0, %xmm5
1091; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
1092; SSSE3-NEXT:    pmullw %xmm3, %xmm5
1093; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1094; SSSE3-NEXT:    movdqa %xmm5, %xmm3
1095; SSSE3-NEXT:    pand %xmm4, %xmm3
1096; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1097; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1098; SSSE3-NEXT:    pmullw %xmm1, %xmm0
1099; SSSE3-NEXT:    pand %xmm0, %xmm4
1100; SSSE3-NEXT:    packuswb %xmm3, %xmm4
1101; SSSE3-NEXT:    psrlw $8, %xmm5
1102; SSSE3-NEXT:    psrlw $8, %xmm0
1103; SSSE3-NEXT:    packuswb %xmm5, %xmm0
1104; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
1105; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
1106; SSSE3-NEXT:    pxor %xmm2, %xmm3
1107; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
1108; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
1109; SSSE3-NEXT:    psrad $24, %xmm0
1110; SSSE3-NEXT:    movdqa %xmm3, %xmm1
1111; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1112; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1113; SSSE3-NEXT:    pslld $31, %xmm1
1114; SSSE3-NEXT:    psrad $31, %xmm1
1115; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1116; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1117; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1118; SSSE3-NEXT:    pslld $31, %xmm2
1119; SSSE3-NEXT:    psrad $31, %xmm2
1120; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1121; SSSE3-NEXT:    pslld $31, %xmm3
1122; SSSE3-NEXT:    psrad $31, %xmm3
1123; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
1124; SSSE3-NEXT:    retq
1125;
1126; SSE41-LABEL: umulo_v16i8:
1127; SSE41:       # %bb.0:
1128; SSE41-NEXT:    pxor %xmm2, %xmm2
1129; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1130; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1131; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1132; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1133; SSE41-NEXT:    pmullw %xmm1, %xmm0
1134; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1135; SSE41-NEXT:    movdqa %xmm0, %xmm1
1136; SSE41-NEXT:    pand %xmm4, %xmm1
1137; SSE41-NEXT:    pmullw %xmm3, %xmm5
1138; SSE41-NEXT:    pand %xmm5, %xmm4
1139; SSE41-NEXT:    packuswb %xmm1, %xmm4
1140; SSE41-NEXT:    psrlw $8, %xmm0
1141; SSE41-NEXT:    psrlw $8, %xmm5
1142; SSE41-NEXT:    packuswb %xmm0, %xmm5
1143; SSE41-NEXT:    pcmpeqb %xmm2, %xmm5
1144; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
1145; SSE41-NEXT:    pxor %xmm5, %xmm3
1146; SSE41-NEXT:    pmovsxbd %xmm3, %xmm0
1147; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
1148; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1149; SSE41-NEXT:    pslld $31, %xmm1
1150; SSE41-NEXT:    psrad $31, %xmm1
1151; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
1152; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
1153; SSE41-NEXT:    pslld $31, %xmm2
1154; SSE41-NEXT:    psrad $31, %xmm2
1155; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
1156; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
1157; SSE41-NEXT:    pslld $31, %xmm3
1158; SSE41-NEXT:    psrad $31, %xmm3
1159; SSE41-NEXT:    movdqa %xmm4, (%rdi)
1160; SSE41-NEXT:    retq
1161;
1162; AVX1-LABEL: umulo_v16i8:
1163; AVX1:       # %bb.0:
1164; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1165; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1166; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1167; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
1168; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1169; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm5
1170; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1171; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1172; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1173; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
1174; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm4
1175; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm1
1176; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1177; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1178; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
1179; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1180; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
1181; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
1182; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
1183; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
1184; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1185; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
1186; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
1187; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1188; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
1189; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1190; AVX1-NEXT:    vmovdqa %xmm4, (%rdi)
1191; AVX1-NEXT:    retq
1192;
1193; AVX2-LABEL: umulo_v16i8:
1194; AVX2:       # %bb.0:
1195; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1196; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1197; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1198; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1199; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1200; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm2
1201; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1202; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1203; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1204; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1205; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
1206; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1207; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
1208; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
1209; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1210; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
1211; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
1212; AVX2-NEXT:    retq
1213;
1214; AVX512F-LABEL: umulo_v16i8:
1215; AVX512F:       # %bb.0:
1216; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1217; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1218; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
1219; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm0
1220; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1221; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
1222; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1223; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1224; AVX512F-NEXT:    vpmovdb %zmm1, (%rdi)
1225; AVX512F-NEXT:    retq
1226;
1227; AVX512BW-LABEL: umulo_v16i8:
1228; AVX512BW:       # %bb.0:
1229; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1230; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1231; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
1232; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm0
1233; AVX512BW-NEXT:    vptestmw %ymm0, %ymm0, %k1
1234; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1235; AVX512BW-NEXT:    vpmovwb %ymm1, (%rdi)
1236; AVX512BW-NEXT:    retq
1237  %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
1238  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
1239  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
1240  %res = sext <16 x i1> %obit to <16 x i32>
1241  store <16 x i8> %val, ptr %p2
1242  ret <16 x i32> %res
1243}
1244
1245define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind {
1246; SSE2-LABEL: umulo_v32i8:
1247; SSE2:       # %bb.0:
1248; SSE2-NEXT:    movq %rdi, %rax
1249; SSE2-NEXT:    pxor %xmm5, %xmm5
1250; SSE2-NEXT:    movdqa %xmm2, %xmm4
1251; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
1252; SSE2-NEXT:    movdqa %xmm0, %xmm6
1253; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
1254; SSE2-NEXT:    pmullw %xmm4, %xmm6
1255; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255]
1256; SSE2-NEXT:    movdqa %xmm6, %xmm7
1257; SSE2-NEXT:    pand %xmm11, %xmm7
1258; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1259; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
1260; SSE2-NEXT:    pmullw %xmm2, %xmm0
1261; SSE2-NEXT:    movdqa %xmm0, %xmm8
1262; SSE2-NEXT:    pand %xmm11, %xmm8
1263; SSE2-NEXT:    packuswb %xmm7, %xmm8
1264; SSE2-NEXT:    movdqa %xmm3, %xmm7
1265; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
1266; SSE2-NEXT:    movdqa %xmm1, %xmm2
1267; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
1268; SSE2-NEXT:    pmullw %xmm7, %xmm2
1269; SSE2-NEXT:    movdqa %xmm2, %xmm7
1270; SSE2-NEXT:    pand %xmm11, %xmm7
1271; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1272; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
1273; SSE2-NEXT:    pmullw %xmm3, %xmm1
1274; SSE2-NEXT:    pand %xmm1, %xmm11
1275; SSE2-NEXT:    packuswb %xmm7, %xmm11
1276; SSE2-NEXT:    psrlw $8, %xmm2
1277; SSE2-NEXT:    psrlw $8, %xmm1
1278; SSE2-NEXT:    packuswb %xmm2, %xmm1
1279; SSE2-NEXT:    pcmpeqb %xmm5, %xmm1
1280; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
1281; SSE2-NEXT:    pxor %xmm2, %xmm1
1282; SSE2-NEXT:    psrlw $8, %xmm6
1283; SSE2-NEXT:    psrlw $8, %xmm0
1284; SSE2-NEXT:    packuswb %xmm6, %xmm0
1285; SSE2-NEXT:    pcmpeqb %xmm5, %xmm0
1286; SSE2-NEXT:    pxor %xmm2, %xmm0
1287; SSE2-NEXT:    movdqa %xmm0, %xmm3
1288; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
1289; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1290; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1291; SSE2-NEXT:    pslld $31, %xmm0
1292; SSE2-NEXT:    psrad $31, %xmm0
1293; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1294; SSE2-NEXT:    movdqa %xmm3, %xmm5
1295; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1296; SSE2-NEXT:    pslld $31, %xmm5
1297; SSE2-NEXT:    psrad $31, %xmm5
1298; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1299; SSE2-NEXT:    pslld $31, %xmm3
1300; SSE2-NEXT:    psrad $31, %xmm3
1301; SSE2-NEXT:    movdqa %xmm1, %xmm6
1302; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1303; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
1304; SSE2-NEXT:    pslld $31, %xmm6
1305; SSE2-NEXT:    psrad $31, %xmm6
1306; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
1307; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1308; SSE2-NEXT:    movdqa %xmm1, %xmm2
1309; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1310; SSE2-NEXT:    pslld $31, %xmm2
1311; SSE2-NEXT:    psrad $31, %xmm2
1312; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1313; SSE2-NEXT:    pslld $31, %xmm1
1314; SSE2-NEXT:    psrad $31, %xmm1
1315; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
1316; SSE2-NEXT:    psrad $24, %xmm7
1317; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
1318; SSE2-NEXT:    psrad $24, %xmm4
1319; SSE2-NEXT:    movdqa %xmm11, 16(%rsi)
1320; SSE2-NEXT:    movdqa %xmm8, (%rsi)
1321; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
1322; SSE2-NEXT:    movdqa %xmm7, (%rdi)
1323; SSE2-NEXT:    movdqa %xmm1, 112(%rdi)
1324; SSE2-NEXT:    movdqa %xmm2, 96(%rdi)
1325; SSE2-NEXT:    movdqa %xmm6, 80(%rdi)
1326; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
1327; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
1328; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1329; SSE2-NEXT:    retq
1330;
1331; SSSE3-LABEL: umulo_v32i8:
1332; SSSE3:       # %bb.0:
1333; SSSE3-NEXT:    movq %rdi, %rax
1334; SSSE3-NEXT:    pxor %xmm5, %xmm5
1335; SSSE3-NEXT:    movdqa %xmm2, %xmm4
1336; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
1337; SSSE3-NEXT:    movdqa %xmm0, %xmm6
1338; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
1339; SSSE3-NEXT:    pmullw %xmm4, %xmm6
1340; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255]
1341; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1342; SSSE3-NEXT:    pand %xmm11, %xmm7
1343; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
1344; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
1345; SSSE3-NEXT:    pmullw %xmm2, %xmm0
1346; SSSE3-NEXT:    movdqa %xmm0, %xmm8
1347; SSSE3-NEXT:    pand %xmm11, %xmm8
1348; SSSE3-NEXT:    packuswb %xmm7, %xmm8
1349; SSSE3-NEXT:    movdqa %xmm3, %xmm7
1350; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
1351; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1352; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
1353; SSSE3-NEXT:    pmullw %xmm7, %xmm2
1354; SSSE3-NEXT:    movdqa %xmm2, %xmm7
1355; SSSE3-NEXT:    pand %xmm11, %xmm7
1356; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
1357; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
1358; SSSE3-NEXT:    pmullw %xmm3, %xmm1
1359; SSSE3-NEXT:    pand %xmm1, %xmm11
1360; SSSE3-NEXT:    packuswb %xmm7, %xmm11
1361; SSSE3-NEXT:    psrlw $8, %xmm2
1362; SSSE3-NEXT:    psrlw $8, %xmm1
1363; SSSE3-NEXT:    packuswb %xmm2, %xmm1
1364; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm1
1365; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
1366; SSSE3-NEXT:    pxor %xmm2, %xmm1
1367; SSSE3-NEXT:    psrlw $8, %xmm6
1368; SSSE3-NEXT:    psrlw $8, %xmm0
1369; SSSE3-NEXT:    packuswb %xmm6, %xmm0
1370; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm0
1371; SSSE3-NEXT:    pxor %xmm2, %xmm0
1372; SSSE3-NEXT:    movdqa %xmm0, %xmm3
1373; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
1374; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1375; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1376; SSSE3-NEXT:    pslld $31, %xmm0
1377; SSSE3-NEXT:    psrad $31, %xmm0
1378; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1379; SSSE3-NEXT:    movdqa %xmm3, %xmm5
1380; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1381; SSSE3-NEXT:    pslld $31, %xmm5
1382; SSSE3-NEXT:    psrad $31, %xmm5
1383; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1384; SSSE3-NEXT:    pslld $31, %xmm3
1385; SSSE3-NEXT:    psrad $31, %xmm3
1386; SSSE3-NEXT:    movdqa %xmm1, %xmm6
1387; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1388; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
1389; SSSE3-NEXT:    pslld $31, %xmm6
1390; SSSE3-NEXT:    psrad $31, %xmm6
1391; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
1392; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1393; SSSE3-NEXT:    movdqa %xmm1, %xmm2
1394; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1395; SSSE3-NEXT:    pslld $31, %xmm2
1396; SSSE3-NEXT:    psrad $31, %xmm2
1397; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1398; SSSE3-NEXT:    pslld $31, %xmm1
1399; SSSE3-NEXT:    psrad $31, %xmm1
1400; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
1401; SSSE3-NEXT:    psrad $24, %xmm7
1402; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
1403; SSSE3-NEXT:    psrad $24, %xmm4
1404; SSSE3-NEXT:    movdqa %xmm11, 16(%rsi)
1405; SSSE3-NEXT:    movdqa %xmm8, (%rsi)
1406; SSSE3-NEXT:    movdqa %xmm4, 64(%rdi)
1407; SSSE3-NEXT:    movdqa %xmm7, (%rdi)
1408; SSSE3-NEXT:    movdqa %xmm1, 112(%rdi)
1409; SSSE3-NEXT:    movdqa %xmm2, 96(%rdi)
1410; SSSE3-NEXT:    movdqa %xmm6, 80(%rdi)
1411; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
1412; SSSE3-NEXT:    movdqa %xmm5, 32(%rdi)
1413; SSSE3-NEXT:    movdqa %xmm0, 16(%rdi)
1414; SSSE3-NEXT:    retq
1415;
1416; SSE41-LABEL: umulo_v32i8:
1417; SSE41:       # %bb.0:
1418; SSE41-NEXT:    movq %rdi, %rax
1419; SSE41-NEXT:    pxor %xmm8, %xmm8
1420; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1421; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
1422; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1423; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
1424; SSE41-NEXT:    pmullw %xmm2, %xmm0
1425; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
1426; SSE41-NEXT:    movdqa %xmm0, %xmm6
1427; SSE41-NEXT:    pand %xmm10, %xmm6
1428; SSE41-NEXT:    pmullw %xmm5, %xmm4
1429; SSE41-NEXT:    movdqa %xmm4, %xmm9
1430; SSE41-NEXT:    pand %xmm10, %xmm9
1431; SSE41-NEXT:    packuswb %xmm6, %xmm9
1432; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1433; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
1434; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1435; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
1436; SSE41-NEXT:    pmullw %xmm3, %xmm1
1437; SSE41-NEXT:    movdqa %xmm1, %xmm3
1438; SSE41-NEXT:    pand %xmm10, %xmm3
1439; SSE41-NEXT:    pmullw %xmm7, %xmm6
1440; SSE41-NEXT:    pand %xmm6, %xmm10
1441; SSE41-NEXT:    packuswb %xmm3, %xmm10
1442; SSE41-NEXT:    psrlw $8, %xmm1
1443; SSE41-NEXT:    psrlw $8, %xmm6
1444; SSE41-NEXT:    packuswb %xmm1, %xmm6
1445; SSE41-NEXT:    pcmpeqb %xmm8, %xmm6
1446; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
1447; SSE41-NEXT:    pxor %xmm1, %xmm6
1448; SSE41-NEXT:    psrlw $8, %xmm0
1449; SSE41-NEXT:    psrlw $8, %xmm4
1450; SSE41-NEXT:    packuswb %xmm0, %xmm4
1451; SSE41-NEXT:    pcmpeqb %xmm8, %xmm4
1452; SSE41-NEXT:    pxor %xmm1, %xmm4
1453; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
1454; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
1455; SSE41-NEXT:    pslld $31, %xmm0
1456; SSE41-NEXT:    psrad $31, %xmm0
1457; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
1458; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
1459; SSE41-NEXT:    pslld $31, %xmm1
1460; SSE41-NEXT:    psrad $31, %xmm1
1461; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3]
1462; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
1463; SSE41-NEXT:    pslld $31, %xmm3
1464; SSE41-NEXT:    psrad $31, %xmm3
1465; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1]
1466; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
1467; SSE41-NEXT:    pslld $31, %xmm7
1468; SSE41-NEXT:    psrad $31, %xmm7
1469; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
1470; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
1471; SSE41-NEXT:    pslld $31, %xmm5
1472; SSE41-NEXT:    psrad $31, %xmm5
1473; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3]
1474; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
1475; SSE41-NEXT:    pslld $31, %xmm2
1476; SSE41-NEXT:    psrad $31, %xmm2
1477; SSE41-NEXT:    pmovsxbd %xmm4, %xmm4
1478; SSE41-NEXT:    pmovsxbd %xmm6, %xmm6
1479; SSE41-NEXT:    movdqa %xmm10, 16(%rsi)
1480; SSE41-NEXT:    movdqa %xmm9, (%rsi)
1481; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
1482; SSE41-NEXT:    movdqa %xmm4, (%rdi)
1483; SSE41-NEXT:    movdqa %xmm2, 112(%rdi)
1484; SSE41-NEXT:    movdqa %xmm5, 96(%rdi)
1485; SSE41-NEXT:    movdqa %xmm7, 80(%rdi)
1486; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
1487; SSE41-NEXT:    movdqa %xmm1, 32(%rdi)
1488; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
1489; SSE41-NEXT:    retq
1490;
1491; AVX1-LABEL: umulo_v32i8:
1492; AVX1:       # %bb.0:
1493; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1494; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1495; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1496; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
1497; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
1498; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm4
1499; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1500; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1501; AVX1-NEXT:    vpmullw %xmm6, %xmm7, %xmm6
1502; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm7
1503; AVX1-NEXT:    vpackuswb %xmm4, %xmm7, %xmm8
1504; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
1505; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1506; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1507; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1508; AVX1-NEXT:    vpmullw %xmm7, %xmm4, %xmm4
1509; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm7
1510; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1511; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1512; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1513; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm1
1514; AVX1-NEXT:    vpackuswb %xmm7, %xmm1, %xmm5
1515; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm1
1516; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1517; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1518; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
1519; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1520; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm4
1521; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm0
1522; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm3
1523; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
1524; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
1525; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
1526; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
1527; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
1528; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
1529; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1530; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm2
1531; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[1,1,1,1]
1532; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
1533; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
1534; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
1535; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
1536; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1537; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
1538; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
1539; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
1540; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
1541; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
1542; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
1543; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
1544; AVX1-NEXT:    vmovdqa %xmm5, 16(%rdi)
1545; AVX1-NEXT:    vmovdqa %xmm8, (%rdi)
1546; AVX1-NEXT:    retq
1547;
1548; AVX2-LABEL: umulo_v32i8:
1549; AVX2:       # %bb.0:
1550; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1551; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
1552; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
1553; AVX2-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
1554; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
1555; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm5
1556; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
1557; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
1558; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1559; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm1
1560; AVX2-NEXT:    vpackuswb %ymm5, %ymm1, %ymm4
1561; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm1
1562; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1563; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
1564; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
1565; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
1566; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm1
1567; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
1568; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
1569; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm2
1570; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1571; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
1572; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
1573; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm3
1574; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
1575; AVX2-NEXT:    retq
1576;
1577; AVX512F-LABEL: umulo_v32i8:
1578; AVX512F:       # %bb.0:
1579; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
1580; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
1581; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
1582; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
1583; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
1584; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm3
1585; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
1586; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
1587; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1588; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1589; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm3
1590; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm0
1591; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1592; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
1593; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
1594; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
1595; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
1596; AVX512F-NEXT:    vpmovdb %zmm2, 16(%rdi)
1597; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
1598; AVX512F-NEXT:    vpmovdb %zmm2, (%rdi)
1599; AVX512F-NEXT:    retq
1600;
1601; AVX512BW-LABEL: umulo_v32i8:
1602; AVX512BW:       # %bb.0:
1603; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
1604; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1605; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm2
1606; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm0
1607; AVX512BW-NEXT:    vptestmw %zmm0, %zmm0, %k1
1608; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1609; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
1610; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
1611; AVX512BW-NEXT:    vpmovwb %zmm2, (%rdi)
1612; AVX512BW-NEXT:    retq
1613  %t = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> %a0, <32 x i8> %a1)
1614  %val = extractvalue {<32 x i8>, <32 x i1>} %t, 0
1615  %obit = extractvalue {<32 x i8>, <32 x i1>} %t, 1
1616  %res = sext <32 x i1> %obit to <32 x i32>
1617  store <32 x i8> %val, ptr %p2
1618  ret <32 x i32> %res
1619}
1620
1621define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind {
1622; SSE2-LABEL: umulo_v64i8:
1623; SSE2:       # %bb.0:
1624; SSE2-NEXT:    movq %rdi, %rax
1625; SSE2-NEXT:    pxor %xmm9, %xmm9
1626; SSE2-NEXT:    movdqa %xmm4, %xmm8
1627; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
1628; SSE2-NEXT:    movdqa %xmm0, %xmm10
1629; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
1630; SSE2-NEXT:    pmullw %xmm8, %xmm10
1631; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
1632; SSE2-NEXT:    movdqa %xmm10, %xmm12
1633; SSE2-NEXT:    pand %xmm8, %xmm12
1634; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
1635; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1636; SSE2-NEXT:    pmullw %xmm4, %xmm0
1637; SSE2-NEXT:    movdqa %xmm0, %xmm11
1638; SSE2-NEXT:    pand %xmm8, %xmm11
1639; SSE2-NEXT:    packuswb %xmm12, %xmm11
1640; SSE2-NEXT:    movdqa %xmm5, %xmm4
1641; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
1642; SSE2-NEXT:    movdqa %xmm1, %xmm13
1643; SSE2-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
1644; SSE2-NEXT:    pmullw %xmm4, %xmm13
1645; SSE2-NEXT:    movdqa %xmm13, %xmm4
1646; SSE2-NEXT:    pand %xmm8, %xmm4
1647; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
1648; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
1649; SSE2-NEXT:    pmullw %xmm5, %xmm1
1650; SSE2-NEXT:    movdqa %xmm1, %xmm12
1651; SSE2-NEXT:    pand %xmm8, %xmm12
1652; SSE2-NEXT:    packuswb %xmm4, %xmm12
1653; SSE2-NEXT:    movdqa %xmm6, %xmm4
1654; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
1655; SSE2-NEXT:    movdqa %xmm2, %xmm5
1656; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
1657; SSE2-NEXT:    pmullw %xmm4, %xmm5
1658; SSE2-NEXT:    movdqa %xmm5, %xmm4
1659; SSE2-NEXT:    pand %xmm8, %xmm4
1660; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
1661; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
1662; SSE2-NEXT:    pmullw %xmm6, %xmm2
1663; SSE2-NEXT:    movdqa %xmm2, %xmm14
1664; SSE2-NEXT:    pand %xmm8, %xmm14
1665; SSE2-NEXT:    packuswb %xmm4, %xmm14
1666; SSE2-NEXT:    movdqa %xmm7, %xmm4
1667; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
1668; SSE2-NEXT:    movdqa %xmm3, %xmm6
1669; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
1670; SSE2-NEXT:    pmullw %xmm4, %xmm6
1671; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
1672; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
1673; SSE2-NEXT:    pmullw %xmm7, %xmm3
1674; SSE2-NEXT:    movdqa %xmm6, %xmm4
1675; SSE2-NEXT:    pand %xmm8, %xmm4
1676; SSE2-NEXT:    pand %xmm3, %xmm8
1677; SSE2-NEXT:    packuswb %xmm4, %xmm8
1678; SSE2-NEXT:    psrlw $8, %xmm6
1679; SSE2-NEXT:    psrlw $8, %xmm3
1680; SSE2-NEXT:    packuswb %xmm6, %xmm3
1681; SSE2-NEXT:    psrlw $8, %xmm5
1682; SSE2-NEXT:    psrlw $8, %xmm2
1683; SSE2-NEXT:    packuswb %xmm5, %xmm2
1684; SSE2-NEXT:    psrlw $8, %xmm13
1685; SSE2-NEXT:    psrlw $8, %xmm1
1686; SSE2-NEXT:    packuswb %xmm13, %xmm1
1687; SSE2-NEXT:    psrlw $8, %xmm10
1688; SSE2-NEXT:    psrlw $8, %xmm0
1689; SSE2-NEXT:    packuswb %xmm10, %xmm0
1690; SSE2-NEXT:    pcmpeqb %xmm9, %xmm3
1691; SSE2-NEXT:    pcmpeqb %xmm9, %xmm2
1692; SSE2-NEXT:    pcmpeqb %xmm9, %xmm1
1693; SSE2-NEXT:    pcmpeqb %xmm9, %xmm0
1694; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
1695; SSE2-NEXT:    pxor %xmm4, %xmm3
1696; SSE2-NEXT:    pxor %xmm4, %xmm2
1697; SSE2-NEXT:    pxor %xmm4, %xmm1
1698; SSE2-NEXT:    pxor %xmm4, %xmm0
1699; SSE2-NEXT:    movdqa %xmm8, 48(%rsi)
1700; SSE2-NEXT:    movdqa %xmm14, 32(%rsi)
1701; SSE2-NEXT:    movdqa %xmm12, 16(%rsi)
1702; SSE2-NEXT:    movdqa %xmm3, %xmm4
1703; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1704; SSE2-NEXT:    movdqa %xmm11, (%rsi)
1705; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
1706; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1707; SSE2-NEXT:    psrad $24, %xmm5
1708; SSE2-NEXT:    movdqa %xmm5, 192(%rdi)
1709; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
1710; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1711; SSE2-NEXT:    psrad $24, %xmm5
1712; SSE2-NEXT:    movdqa %xmm5, 128(%rdi)
1713; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
1714; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1715; SSE2-NEXT:    psrad $24, %xmm5
1716; SSE2-NEXT:    movdqa %xmm5, 64(%rdi)
1717; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
1718; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1719; SSE2-NEXT:    psrad $24, %xmm5
1720; SSE2-NEXT:    movdqa %xmm5, (%rdi)
1721; SSE2-NEXT:    movdqa %xmm4, %xmm5
1722; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
1723; SSE2-NEXT:    pslld $31, %xmm4
1724; SSE2-NEXT:    psrad $31, %xmm4
1725; SSE2-NEXT:    movdqa %xmm4, 224(%rdi)
1726; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
1727; SSE2-NEXT:    pslld $31, %xmm5
1728; SSE2-NEXT:    psrad $31, %xmm5
1729; SSE2-NEXT:    movdqa %xmm5, 240(%rdi)
1730; SSE2-NEXT:    movdqa %xmm2, %xmm4
1731; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1732; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1733; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1734; SSE2-NEXT:    pslld $31, %xmm3
1735; SSE2-NEXT:    psrad $31, %xmm3
1736; SSE2-NEXT:    movdqa %xmm3, 208(%rdi)
1737; SSE2-NEXT:    movdqa %xmm4, %xmm3
1738; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
1739; SSE2-NEXT:    pslld $31, %xmm4
1740; SSE2-NEXT:    psrad $31, %xmm4
1741; SSE2-NEXT:    movdqa %xmm4, 160(%rdi)
1742; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1743; SSE2-NEXT:    pslld $31, %xmm3
1744; SSE2-NEXT:    psrad $31, %xmm3
1745; SSE2-NEXT:    movdqa %xmm3, 176(%rdi)
1746; SSE2-NEXT:    movdqa %xmm1, %xmm3
1747; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1748; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1749; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
1750; SSE2-NEXT:    pslld $31, %xmm2
1751; SSE2-NEXT:    psrad $31, %xmm2
1752; SSE2-NEXT:    movdqa %xmm2, 144(%rdi)
1753; SSE2-NEXT:    movdqa %xmm3, %xmm2
1754; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
1755; SSE2-NEXT:    pslld $31, %xmm3
1756; SSE2-NEXT:    psrad $31, %xmm3
1757; SSE2-NEXT:    movdqa %xmm3, 96(%rdi)
1758; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
1759; SSE2-NEXT:    pslld $31, %xmm2
1760; SSE2-NEXT:    psrad $31, %xmm2
1761; SSE2-NEXT:    movdqa %xmm2, 112(%rdi)
1762; SSE2-NEXT:    movdqa %xmm0, %xmm2
1763; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1764; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1765; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1766; SSE2-NEXT:    pslld $31, %xmm1
1767; SSE2-NEXT:    psrad $31, %xmm1
1768; SSE2-NEXT:    movdqa %xmm1, 80(%rdi)
1769; SSE2-NEXT:    movdqa %xmm2, %xmm1
1770; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1771; SSE2-NEXT:    pslld $31, %xmm2
1772; SSE2-NEXT:    psrad $31, %xmm2
1773; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
1774; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1775; SSE2-NEXT:    pslld $31, %xmm1
1776; SSE2-NEXT:    psrad $31, %xmm1
1777; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
1778; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1779; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1780; SSE2-NEXT:    pslld $31, %xmm0
1781; SSE2-NEXT:    psrad $31, %xmm0
1782; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
1783; SSE2-NEXT:    retq
1784;
1785; SSSE3-LABEL: umulo_v64i8:
1786; SSSE3:       # %bb.0:
1787; SSSE3-NEXT:    movq %rdi, %rax
1788; SSSE3-NEXT:    pxor %xmm9, %xmm9
1789; SSSE3-NEXT:    movdqa %xmm4, %xmm8
1790; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
1791; SSSE3-NEXT:    movdqa %xmm0, %xmm10
1792; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
1793; SSSE3-NEXT:    pmullw %xmm8, %xmm10
1794; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
1795; SSSE3-NEXT:    movdqa %xmm10, %xmm12
1796; SSSE3-NEXT:    pand %xmm8, %xmm12
1797; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
1798; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
1799; SSSE3-NEXT:    pmullw %xmm4, %xmm0
1800; SSSE3-NEXT:    movdqa %xmm0, %xmm11
1801; SSSE3-NEXT:    pand %xmm8, %xmm11
1802; SSSE3-NEXT:    packuswb %xmm12, %xmm11
1803; SSSE3-NEXT:    movdqa %xmm5, %xmm4
1804; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
1805; SSSE3-NEXT:    movdqa %xmm1, %xmm13
1806; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
1807; SSSE3-NEXT:    pmullw %xmm4, %xmm13
1808; SSSE3-NEXT:    movdqa %xmm13, %xmm4
1809; SSSE3-NEXT:    pand %xmm8, %xmm4
1810; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
1811; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
1812; SSSE3-NEXT:    pmullw %xmm5, %xmm1
1813; SSSE3-NEXT:    movdqa %xmm1, %xmm12
1814; SSSE3-NEXT:    pand %xmm8, %xmm12
1815; SSSE3-NEXT:    packuswb %xmm4, %xmm12
1816; SSSE3-NEXT:    movdqa %xmm6, %xmm4
1817; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
1818; SSSE3-NEXT:    movdqa %xmm2, %xmm5
1819; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
1820; SSSE3-NEXT:    pmullw %xmm4, %xmm5
1821; SSSE3-NEXT:    movdqa %xmm5, %xmm4
1822; SSSE3-NEXT:    pand %xmm8, %xmm4
1823; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
1824; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
1825; SSSE3-NEXT:    pmullw %xmm6, %xmm2
1826; SSSE3-NEXT:    movdqa %xmm2, %xmm14
1827; SSSE3-NEXT:    pand %xmm8, %xmm14
1828; SSSE3-NEXT:    packuswb %xmm4, %xmm14
1829; SSSE3-NEXT:    movdqa %xmm7, %xmm4
1830; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
1831; SSSE3-NEXT:    movdqa %xmm3, %xmm6
1832; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
1833; SSSE3-NEXT:    pmullw %xmm4, %xmm6
1834; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
1835; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
1836; SSSE3-NEXT:    pmullw %xmm7, %xmm3
1837; SSSE3-NEXT:    movdqa %xmm6, %xmm4
1838; SSSE3-NEXT:    pand %xmm8, %xmm4
1839; SSSE3-NEXT:    pand %xmm3, %xmm8
1840; SSSE3-NEXT:    packuswb %xmm4, %xmm8
1841; SSSE3-NEXT:    psrlw $8, %xmm6
1842; SSSE3-NEXT:    psrlw $8, %xmm3
1843; SSSE3-NEXT:    packuswb %xmm6, %xmm3
1844; SSSE3-NEXT:    psrlw $8, %xmm5
1845; SSSE3-NEXT:    psrlw $8, %xmm2
1846; SSSE3-NEXT:    packuswb %xmm5, %xmm2
1847; SSSE3-NEXT:    psrlw $8, %xmm13
1848; SSSE3-NEXT:    psrlw $8, %xmm1
1849; SSSE3-NEXT:    packuswb %xmm13, %xmm1
1850; SSSE3-NEXT:    psrlw $8, %xmm10
1851; SSSE3-NEXT:    psrlw $8, %xmm0
1852; SSSE3-NEXT:    packuswb %xmm10, %xmm0
1853; SSSE3-NEXT:    pcmpeqb %xmm9, %xmm3
1854; SSSE3-NEXT:    pcmpeqb %xmm9, %xmm2
1855; SSSE3-NEXT:    pcmpeqb %xmm9, %xmm1
1856; SSSE3-NEXT:    pcmpeqb %xmm9, %xmm0
1857; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
1858; SSSE3-NEXT:    pxor %xmm4, %xmm3
1859; SSSE3-NEXT:    pxor %xmm4, %xmm2
1860; SSSE3-NEXT:    pxor %xmm4, %xmm1
1861; SSSE3-NEXT:    pxor %xmm4, %xmm0
1862; SSSE3-NEXT:    movdqa %xmm8, 48(%rsi)
1863; SSSE3-NEXT:    movdqa %xmm14, 32(%rsi)
1864; SSSE3-NEXT:    movdqa %xmm12, 16(%rsi)
1865; SSSE3-NEXT:    movdqa %xmm3, %xmm4
1866; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1867; SSSE3-NEXT:    movdqa %xmm11, (%rsi)
1868; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
1869; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1870; SSSE3-NEXT:    psrad $24, %xmm5
1871; SSSE3-NEXT:    movdqa %xmm5, 192(%rdi)
1872; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
1873; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1874; SSSE3-NEXT:    psrad $24, %xmm5
1875; SSSE3-NEXT:    movdqa %xmm5, 128(%rdi)
1876; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
1877; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1878; SSSE3-NEXT:    psrad $24, %xmm5
1879; SSSE3-NEXT:    movdqa %xmm5, 64(%rdi)
1880; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
1881; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
1882; SSSE3-NEXT:    psrad $24, %xmm5
1883; SSSE3-NEXT:    movdqa %xmm5, (%rdi)
1884; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1885; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
1886; SSSE3-NEXT:    pslld $31, %xmm4
1887; SSSE3-NEXT:    psrad $31, %xmm4
1888; SSSE3-NEXT:    movdqa %xmm4, 224(%rdi)
1889; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
1890; SSSE3-NEXT:    pslld $31, %xmm5
1891; SSSE3-NEXT:    psrad $31, %xmm5
1892; SSSE3-NEXT:    movdqa %xmm5, 240(%rdi)
1893; SSSE3-NEXT:    movdqa %xmm2, %xmm4
1894; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1895; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1896; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1897; SSSE3-NEXT:    pslld $31, %xmm3
1898; SSSE3-NEXT:    psrad $31, %xmm3
1899; SSSE3-NEXT:    movdqa %xmm3, 208(%rdi)
1900; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1901; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
1902; SSSE3-NEXT:    pslld $31, %xmm4
1903; SSSE3-NEXT:    psrad $31, %xmm4
1904; SSSE3-NEXT:    movdqa %xmm4, 160(%rdi)
1905; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
1906; SSSE3-NEXT:    pslld $31, %xmm3
1907; SSSE3-NEXT:    psrad $31, %xmm3
1908; SSSE3-NEXT:    movdqa %xmm3, 176(%rdi)
1909; SSSE3-NEXT:    movdqa %xmm1, %xmm3
1910; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1911; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1912; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
1913; SSSE3-NEXT:    pslld $31, %xmm2
1914; SSSE3-NEXT:    psrad $31, %xmm2
1915; SSSE3-NEXT:    movdqa %xmm2, 144(%rdi)
1916; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1917; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
1918; SSSE3-NEXT:    pslld $31, %xmm3
1919; SSSE3-NEXT:    psrad $31, %xmm3
1920; SSSE3-NEXT:    movdqa %xmm3, 96(%rdi)
1921; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
1922; SSSE3-NEXT:    pslld $31, %xmm2
1923; SSSE3-NEXT:    psrad $31, %xmm2
1924; SSSE3-NEXT:    movdqa %xmm2, 112(%rdi)
1925; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1926; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1927; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1928; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1929; SSSE3-NEXT:    pslld $31, %xmm1
1930; SSSE3-NEXT:    psrad $31, %xmm1
1931; SSSE3-NEXT:    movdqa %xmm1, 80(%rdi)
1932; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1933; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
1934; SSSE3-NEXT:    pslld $31, %xmm2
1935; SSSE3-NEXT:    psrad $31, %xmm2
1936; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
1937; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
1938; SSSE3-NEXT:    pslld $31, %xmm1
1939; SSSE3-NEXT:    psrad $31, %xmm1
1940; SSSE3-NEXT:    movdqa %xmm1, 48(%rdi)
1941; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1942; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
1943; SSSE3-NEXT:    pslld $31, %xmm0
1944; SSSE3-NEXT:    psrad $31, %xmm0
1945; SSSE3-NEXT:    movdqa %xmm0, 16(%rdi)
1946; SSSE3-NEXT:    retq
1947;
1948; SSE41-LABEL: umulo_v64i8:
1949; SSE41:       # %bb.0:
1950; SSE41-NEXT:    movq %rdi, %rax
1951; SSE41-NEXT:    pxor %xmm13, %xmm13
1952; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
1953; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15]
1954; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1955; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
1956; SSE41-NEXT:    pmullw %xmm4, %xmm0
1957; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
1958; SSE41-NEXT:    movdqa %xmm0, %xmm4
1959; SSE41-NEXT:    pand %xmm9, %xmm4
1960; SSE41-NEXT:    pmullw %xmm10, %xmm8
1961; SSE41-NEXT:    movdqa %xmm8, %xmm10
1962; SSE41-NEXT:    pand %xmm9, %xmm10
1963; SSE41-NEXT:    packuswb %xmm4, %xmm10
1964; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm11 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
1965; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
1966; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1967; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
1968; SSE41-NEXT:    pmullw %xmm5, %xmm1
1969; SSE41-NEXT:    movdqa %xmm1, %xmm5
1970; SSE41-NEXT:    pand %xmm9, %xmm5
1971; SSE41-NEXT:    pmullw %xmm11, %xmm4
1972; SSE41-NEXT:    movdqa %xmm4, %xmm11
1973; SSE41-NEXT:    pand %xmm9, %xmm11
1974; SSE41-NEXT:    packuswb %xmm5, %xmm11
1975; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
1976; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15]
1977; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
1978; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
1979; SSE41-NEXT:    pmullw %xmm6, %xmm2
1980; SSE41-NEXT:    movdqa %xmm2, %xmm6
1981; SSE41-NEXT:    pand %xmm9, %xmm6
1982; SSE41-NEXT:    pmullw %xmm12, %xmm5
1983; SSE41-NEXT:    movdqa %xmm5, %xmm12
1984; SSE41-NEXT:    pand %xmm9, %xmm12
1985; SSE41-NEXT:    packuswb %xmm6, %xmm12
1986; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
1987; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15]
1988; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
1989; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
1990; SSE41-NEXT:    pmullw %xmm7, %xmm3
1991; SSE41-NEXT:    pmullw %xmm14, %xmm6
1992; SSE41-NEXT:    movdqa %xmm3, %xmm7
1993; SSE41-NEXT:    pand %xmm9, %xmm7
1994; SSE41-NEXT:    pand %xmm6, %xmm9
1995; SSE41-NEXT:    packuswb %xmm7, %xmm9
1996; SSE41-NEXT:    psrlw $8, %xmm3
1997; SSE41-NEXT:    psrlw $8, %xmm6
1998; SSE41-NEXT:    packuswb %xmm3, %xmm6
1999; SSE41-NEXT:    psrlw $8, %xmm2
2000; SSE41-NEXT:    psrlw $8, %xmm5
2001; SSE41-NEXT:    packuswb %xmm2, %xmm5
2002; SSE41-NEXT:    psrlw $8, %xmm1
2003; SSE41-NEXT:    psrlw $8, %xmm4
2004; SSE41-NEXT:    packuswb %xmm1, %xmm4
2005; SSE41-NEXT:    psrlw $8, %xmm0
2006; SSE41-NEXT:    psrlw $8, %xmm8
2007; SSE41-NEXT:    packuswb %xmm0, %xmm8
2008; SSE41-NEXT:    pcmpeqb %xmm13, %xmm6
2009; SSE41-NEXT:    pcmpeqb %xmm13, %xmm5
2010; SSE41-NEXT:    pcmpeqb %xmm13, %xmm4
2011; SSE41-NEXT:    pcmpeqb %xmm13, %xmm8
2012; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
2013; SSE41-NEXT:    pxor %xmm0, %xmm6
2014; SSE41-NEXT:    pxor %xmm0, %xmm5
2015; SSE41-NEXT:    pxor %xmm0, %xmm4
2016; SSE41-NEXT:    pxor %xmm0, %xmm8
2017; SSE41-NEXT:    movdqa %xmm9, 48(%rsi)
2018; SSE41-NEXT:    movdqa %xmm12, 32(%rsi)
2019; SSE41-NEXT:    movdqa %xmm11, 16(%rsi)
2020; SSE41-NEXT:    movdqa %xmm10, (%rsi)
2021; SSE41-NEXT:    pmovsxbd %xmm6, %xmm0
2022; SSE41-NEXT:    movdqa %xmm0, 192(%rdi)
2023; SSE41-NEXT:    pmovsxbd %xmm5, %xmm0
2024; SSE41-NEXT:    movdqa %xmm0, 128(%rdi)
2025; SSE41-NEXT:    pmovsxbd %xmm4, %xmm0
2026; SSE41-NEXT:    movdqa %xmm0, 64(%rdi)
2027; SSE41-NEXT:    pmovsxbd %xmm8, %xmm0
2028; SSE41-NEXT:    movdqa %xmm0, (%rdi)
2029; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
2030; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2031; SSE41-NEXT:    pslld $31, %xmm0
2032; SSE41-NEXT:    psrad $31, %xmm0
2033; SSE41-NEXT:    movdqa %xmm0, 224(%rdi)
2034; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
2035; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2036; SSE41-NEXT:    pslld $31, %xmm0
2037; SSE41-NEXT:    psrad $31, %xmm0
2038; SSE41-NEXT:    movdqa %xmm0, 240(%rdi)
2039; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
2040; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2041; SSE41-NEXT:    pslld $31, %xmm0
2042; SSE41-NEXT:    psrad $31, %xmm0
2043; SSE41-NEXT:    movdqa %xmm0, 208(%rdi)
2044; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
2045; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2046; SSE41-NEXT:    pslld $31, %xmm0
2047; SSE41-NEXT:    psrad $31, %xmm0
2048; SSE41-NEXT:    movdqa %xmm0, 160(%rdi)
2049; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
2050; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2051; SSE41-NEXT:    pslld $31, %xmm0
2052; SSE41-NEXT:    psrad $31, %xmm0
2053; SSE41-NEXT:    movdqa %xmm0, 176(%rdi)
2054; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
2055; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2056; SSE41-NEXT:    pslld $31, %xmm0
2057; SSE41-NEXT:    psrad $31, %xmm0
2058; SSE41-NEXT:    movdqa %xmm0, 144(%rdi)
2059; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
2060; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2061; SSE41-NEXT:    pslld $31, %xmm0
2062; SSE41-NEXT:    psrad $31, %xmm0
2063; SSE41-NEXT:    movdqa %xmm0, 96(%rdi)
2064; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
2065; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2066; SSE41-NEXT:    pslld $31, %xmm0
2067; SSE41-NEXT:    psrad $31, %xmm0
2068; SSE41-NEXT:    movdqa %xmm0, 112(%rdi)
2069; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
2070; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2071; SSE41-NEXT:    pslld $31, %xmm0
2072; SSE41-NEXT:    psrad $31, %xmm0
2073; SSE41-NEXT:    movdqa %xmm0, 80(%rdi)
2074; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
2075; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2076; SSE41-NEXT:    pslld $31, %xmm0
2077; SSE41-NEXT:    psrad $31, %xmm0
2078; SSE41-NEXT:    movdqa %xmm0, 32(%rdi)
2079; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
2080; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2081; SSE41-NEXT:    pslld $31, %xmm0
2082; SSE41-NEXT:    psrad $31, %xmm0
2083; SSE41-NEXT:    movdqa %xmm0, 48(%rdi)
2084; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
2085; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2086; SSE41-NEXT:    pslld $31, %xmm0
2087; SSE41-NEXT:    psrad $31, %xmm0
2088; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
2089; SSE41-NEXT:    retq
2090;
2091; AVX1-LABEL: umulo_v64i8:
2092; AVX1:       # %bb.0:
2093; AVX1-NEXT:    movq %rdi, %rax
2094; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
2095; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2096; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2097; AVX1-NEXT:    vpmullw %xmm4, %xmm6, %xmm9
2098; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
2099; AVX1-NEXT:    vpand %xmm6, %xmm9, %xmm8
2100; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2101; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2102; AVX1-NEXT:    vpmullw %xmm7, %xmm4, %xmm11
2103; AVX1-NEXT:    vpand %xmm6, %xmm11, %xmm4
2104; AVX1-NEXT:    vpackuswb %xmm8, %xmm4, %xmm8
2105; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2106; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2107; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2108; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2109; AVX1-NEXT:    vpmullw %xmm4, %xmm7, %xmm12
2110; AVX1-NEXT:    vpand %xmm6, %xmm12, %xmm7
2111; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2112; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2113; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm13
2114; AVX1-NEXT:    vpand %xmm6, %xmm13, %xmm2
2115; AVX1-NEXT:    vpackuswb %xmm7, %xmm2, %xmm10
2116; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
2117; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2118; AVX1-NEXT:    vpmullw %xmm2, %xmm7, %xmm7
2119; AVX1-NEXT:    vpand %xmm6, %xmm7, %xmm2
2120; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2121; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2122; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
2123; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm4
2124; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm14
2125; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2126; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
2127; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2128; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2129; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
2130; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2131; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2132; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm3
2133; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm1
2134; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm4
2135; AVX1-NEXT:    vpackuswb %xmm1, %xmm4, %xmm15
2136; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
2137; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
2138; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
2139; AVX1-NEXT:    vpsrlw $8, %xmm7, %xmm3
2140; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
2141; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
2142; AVX1-NEXT:    vpsrlw $8, %xmm12, %xmm3
2143; AVX1-NEXT:    vpsrlw $8, %xmm13, %xmm4
2144; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
2145; AVX1-NEXT:    vpsrlw $8, %xmm9, %xmm4
2146; AVX1-NEXT:    vpsrlw $8, %xmm11, %xmm6
2147; AVX1-NEXT:    vpackuswb %xmm4, %xmm6, %xmm4
2148; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm2
2149; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm0
2150; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm3, %xmm3
2151; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm4, %xmm7
2152; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2153; AVX1-NEXT:    vpxor %xmm1, %xmm2, %xmm6
2154; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm4
2155; AVX1-NEXT:    vpxor %xmm1, %xmm3, %xmm5
2156; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm3
2157; AVX1-NEXT:    vmovdqa %xmm15, 48(%rsi)
2158; AVX1-NEXT:    vmovdqa %xmm14, 32(%rsi)
2159; AVX1-NEXT:    vmovdqa %xmm10, 16(%rsi)
2160; AVX1-NEXT:    vmovdqa %xmm8, (%rsi)
2161; AVX1-NEXT:    vpmovsxbd %xmm6, %xmm0
2162; AVX1-NEXT:    vmovdqa %xmm0, 192(%rdi)
2163; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm0
2164; AVX1-NEXT:    vmovdqa %xmm0, 128(%rdi)
2165; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm0
2166; AVX1-NEXT:    vmovdqa %xmm0, 64(%rdi)
2167; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm0
2168; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
2169; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
2170; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2171; AVX1-NEXT:    vmovdqa %xmm0, 224(%rdi)
2172; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
2173; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2174; AVX1-NEXT:    vmovdqa %xmm0, 240(%rdi)
2175; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
2176; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2177; AVX1-NEXT:    vmovdqa %xmm0, 208(%rdi)
2178; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
2179; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2180; AVX1-NEXT:    vmovdqa %xmm0, 160(%rdi)
2181; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
2182; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2183; AVX1-NEXT:    vmovdqa %xmm0, 176(%rdi)
2184; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
2185; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2186; AVX1-NEXT:    vmovdqa %xmm0, 144(%rdi)
2187; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
2188; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2189; AVX1-NEXT:    vmovdqa %xmm0, 96(%rdi)
2190; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
2191; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2192; AVX1-NEXT:    vmovdqa %xmm0, 112(%rdi)
2193; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
2194; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2195; AVX1-NEXT:    vmovdqa %xmm0, 80(%rdi)
2196; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
2197; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2198; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdi)
2199; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
2200; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2201; AVX1-NEXT:    vmovdqa %xmm0, 48(%rdi)
2202; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
2203; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2204; AVX1-NEXT:    vmovdqa %xmm0, 16(%rdi)
2205; AVX1-NEXT:    vzeroupper
2206; AVX1-NEXT:    retq
2207;
2208; AVX2-LABEL: umulo_v64i8:
2209; AVX2:       # %bb.0:
2210; AVX2-NEXT:    movq %rdi, %rax
2211; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
2212; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
2213; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31]
2214; AVX2-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
2215; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2216; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm7
2217; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
2218; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23]
2219; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm2
2220; AVX2-NEXT:    vpand %ymm6, %ymm2, %ymm0
2221; AVX2-NEXT:    vpackuswb %ymm7, %ymm0, %ymm9
2222; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
2223; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
2224; AVX2-NEXT:    vpmullw %ymm7, %ymm8, %ymm7
2225; AVX2-NEXT:    vpand %ymm6, %ymm7, %ymm8
2226; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
2227; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
2228; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
2229; AVX2-NEXT:    vpand %ymm6, %ymm1, %ymm3
2230; AVX2-NEXT:    vpackuswb %ymm8, %ymm3, %ymm8
2231; AVX2-NEXT:    vpsrlw $8, %ymm7, %ymm6
2232; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
2233; AVX2-NEXT:    vpackuswb %ymm6, %ymm1, %ymm1
2234; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm1
2235; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
2236; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1
2237; AVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
2238; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
2239; AVX2-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
2240; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm2, %ymm2
2241; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
2242; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
2243; AVX2-NEXT:    vpmovsxbd %xmm4, %ymm4
2244; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
2245; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
2246; AVX2-NEXT:    vpmovsxbd %xmm6, %ymm6
2247; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
2248; AVX2-NEXT:    vpmovsxbd %xmm7, %ymm7
2249; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
2250; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
2251; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm3
2252; AVX2-NEXT:    vpmovsxbd %xmm2, %ymm2
2253; AVX2-NEXT:    vpmovsxbd %xmm5, %ymm5
2254; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
2255; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
2256; AVX2-NEXT:    vmovdqa %ymm8, 32(%rsi)
2257; AVX2-NEXT:    vmovdqa %ymm9, (%rsi)
2258; AVX2-NEXT:    vmovdqa %ymm0, 192(%rdi)
2259; AVX2-NEXT:    vmovdqa %ymm1, 128(%rdi)
2260; AVX2-NEXT:    vmovdqa %ymm5, 64(%rdi)
2261; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
2262; AVX2-NEXT:    vmovdqa %ymm3, 224(%rdi)
2263; AVX2-NEXT:    vmovdqa %ymm7, 160(%rdi)
2264; AVX2-NEXT:    vmovdqa %ymm6, 96(%rdi)
2265; AVX2-NEXT:    vmovdqa %ymm4, 32(%rdi)
2266; AVX2-NEXT:    vzeroupper
2267; AVX2-NEXT:    retq
2268;
2269; AVX512F-LABEL: umulo_v64i8:
2270; AVX512F:       # %bb.0:
2271; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
2272; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm3
2273; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
2274; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
2275; AVX512F-NEXT:    vextracti128 $1, %ymm5, %xmm4
2276; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2277; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
2278; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm3
2279; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
2280; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
2281; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2282; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
2283; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm5
2284; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm2
2285; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
2286; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k2
2287; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
2288; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2289; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
2290; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
2291; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm6
2292; AVX512F-NEXT:    vpsrlw $8, %ymm6, %ymm2
2293; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
2294; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k3
2295; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2296; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2297; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm7
2298; AVX512F-NEXT:    vpsrlw $8, %ymm7, %ymm0
2299; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2300; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k4
2301; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
2302; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
2303; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
2304; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
2305; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
2306; AVX512F-NEXT:    vpmovdb %zmm4, 48(%rdi)
2307; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
2308; AVX512F-NEXT:    vpmovdb %zmm4, 32(%rdi)
2309; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
2310; AVX512F-NEXT:    vpmovdb %zmm4, 16(%rdi)
2311; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
2312; AVX512F-NEXT:    vpmovdb %zmm4, (%rdi)
2313; AVX512F-NEXT:    retq
2314;
2315; AVX512BW-LABEL: umulo_v64i8:
2316; AVX512BW:       # %bb.0:
2317; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2318; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
2319; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63]
2320; AVX512BW-NEXT:    vpmullw %zmm3, %zmm4, %zmm3
2321; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2322; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm5
2323; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
2324; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55]
2325; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
2326; AVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm1
2327; AVX512BW-NEXT:    vpackuswb %zmm5, %zmm1, %zmm4
2328; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm1
2329; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
2330; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
2331; AVX512BW-NEXT:    vptestmb %zmm0, %zmm0, %k1
2332; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2333; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
2334; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
2335; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
2336; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
2337; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
2338; AVX512BW-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
2339; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rdi)
2340; AVX512BW-NEXT:    retq
2341  %t = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> %a0, <64 x i8> %a1)
2342  %val = extractvalue {<64 x i8>, <64 x i1>} %t, 0
2343  %obit = extractvalue {<64 x i8>, <64 x i1>} %t, 1
2344  %res = sext <64 x i1> %obit to <64 x i32>
2345  store <64 x i8> %val, ptr %p2
2346  ret <64 x i32> %res
2347}
2348
2349define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind {
2350; SSE2-LABEL: umulo_v8i16:
2351; SSE2:       # %bb.0:
2352; SSE2-NEXT:    movdqa %xmm0, %xmm2
2353; SSE2-NEXT:    pmullw %xmm1, %xmm2
2354; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
2355; SSE2-NEXT:    pxor %xmm0, %xmm0
2356; SSE2-NEXT:    pcmpeqw %xmm0, %xmm1
2357; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
2358; SSE2-NEXT:    pxor %xmm0, %xmm1
2359; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2360; SSE2-NEXT:    psrad $16, %xmm0
2361; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2362; SSE2-NEXT:    pslld $31, %xmm1
2363; SSE2-NEXT:    psrad $31, %xmm1
2364; SSE2-NEXT:    movdqa %xmm2, (%rdi)
2365; SSE2-NEXT:    retq
2366;
2367; SSSE3-LABEL: umulo_v8i16:
2368; SSSE3:       # %bb.0:
2369; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2370; SSSE3-NEXT:    pmullw %xmm1, %xmm2
2371; SSSE3-NEXT:    pmulhuw %xmm0, %xmm1
2372; SSSE3-NEXT:    pxor %xmm0, %xmm0
2373; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm1
2374; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
2375; SSSE3-NEXT:    pxor %xmm0, %xmm1
2376; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2377; SSSE3-NEXT:    psrad $16, %xmm0
2378; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2379; SSSE3-NEXT:    pslld $31, %xmm1
2380; SSSE3-NEXT:    psrad $31, %xmm1
2381; SSSE3-NEXT:    movdqa %xmm2, (%rdi)
2382; SSSE3-NEXT:    retq
2383;
2384; SSE41-LABEL: umulo_v8i16:
2385; SSE41:       # %bb.0:
2386; SSE41-NEXT:    movdqa %xmm0, %xmm2
2387; SSE41-NEXT:    pmullw %xmm1, %xmm2
2388; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
2389; SSE41-NEXT:    pxor %xmm0, %xmm0
2390; SSE41-NEXT:    pcmpeqw %xmm0, %xmm1
2391; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
2392; SSE41-NEXT:    pxor %xmm0, %xmm1
2393; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
2394; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
2395; SSE41-NEXT:    pslld $31, %xmm1
2396; SSE41-NEXT:    psrad $31, %xmm1
2397; SSE41-NEXT:    movdqa %xmm2, (%rdi)
2398; SSE41-NEXT:    retq
2399;
2400; AVX1-LABEL: umulo_v8i16:
2401; AVX1:       # %bb.0:
2402; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
2403; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
2404; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2405; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
2406; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2407; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2408; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
2409; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
2410; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
2411; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2412; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
2413; AVX1-NEXT:    retq
2414;
2415; AVX2-LABEL: umulo_v8i16:
2416; AVX2:       # %bb.0:
2417; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
2418; AVX2-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
2419; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2420; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
2421; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2422; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
2423; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
2424; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
2425; AVX2-NEXT:    retq
2426;
2427; AVX512F-LABEL: umulo_v8i16:
2428; AVX512F:       # %bb.0:
2429; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
2430; AVX512F-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
2431; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2432; AVX512F-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
2433; AVX512F-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
2434; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
2435; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k1
2436; AVX512F-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
2437; AVX512F-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
2438; AVX512F-NEXT:    vmovdqa %xmm2, (%rdi)
2439; AVX512F-NEXT:    retq
2440;
2441; AVX512BW-LABEL: umulo_v8i16:
2442; AVX512BW:       # %bb.0:
2443; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
2444; AVX512BW-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
2445; AVX512BW-NEXT:    vptestmw %xmm0, %xmm0, %k1
2446; AVX512BW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
2447; AVX512BW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
2448; AVX512BW-NEXT:    vmovdqa %xmm2, (%rdi)
2449; AVX512BW-NEXT:    retq
2450  %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
2451  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
2452  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
2453  %res = sext <8 x i1> %obit to <8 x i32>
2454  store <8 x i16> %val, ptr %p2
2455  ret <8 x i32> %res
2456}
2457
2458define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind {
2459; SSE2-LABEL: umulo_v2i64:
2460; SSE2:       # %bb.0:
2461; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2462; SSE2-NEXT:    movq %xmm2, %r8
2463; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
2464; SSE2-NEXT:    movq %xmm2, %r10
2465; SSE2-NEXT:    movq %xmm0, %rax
2466; SSE2-NEXT:    movq %xmm1, %rdx
2467; SSE2-NEXT:    xorl %ecx, %ecx
2468; SSE2-NEXT:    mulq %rdx
2469; SSE2-NEXT:    movq $-1, %r9
2470; SSE2-NEXT:    movl $0, %esi
2471; SSE2-NEXT:    cmovoq %r9, %rsi
2472; SSE2-NEXT:    movq %rax, %xmm1
2473; SSE2-NEXT:    movq %r8, %rax
2474; SSE2-NEXT:    mulq %r10
2475; SSE2-NEXT:    movq %rax, %xmm0
2476; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2477; SSE2-NEXT:    movq %rsi, %xmm0
2478; SSE2-NEXT:    cmovoq %r9, %rcx
2479; SSE2-NEXT:    movq %rcx, %xmm2
2480; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2481; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2482; SSE2-NEXT:    movdqa %xmm1, (%rdi)
2483; SSE2-NEXT:    retq
2484;
2485; SSSE3-LABEL: umulo_v2i64:
2486; SSSE3:       # %bb.0:
2487; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
2488; SSSE3-NEXT:    movq %xmm2, %r8
2489; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
2490; SSSE3-NEXT:    movq %xmm2, %r10
2491; SSSE3-NEXT:    movq %xmm0, %rax
2492; SSSE3-NEXT:    movq %xmm1, %rdx
2493; SSSE3-NEXT:    xorl %ecx, %ecx
2494; SSSE3-NEXT:    mulq %rdx
2495; SSSE3-NEXT:    movq $-1, %r9
2496; SSSE3-NEXT:    movl $0, %esi
2497; SSSE3-NEXT:    cmovoq %r9, %rsi
2498; SSSE3-NEXT:    movq %rax, %xmm1
2499; SSSE3-NEXT:    movq %r8, %rax
2500; SSSE3-NEXT:    mulq %r10
2501; SSSE3-NEXT:    movq %rax, %xmm0
2502; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2503; SSSE3-NEXT:    movq %rsi, %xmm0
2504; SSSE3-NEXT:    cmovoq %r9, %rcx
2505; SSSE3-NEXT:    movq %rcx, %xmm2
2506; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
2507; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2508; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
2509; SSSE3-NEXT:    retq
2510;
2511; SSE41-LABEL: umulo_v2i64:
2512; SSE41:       # %bb.0:
2513; SSE41-NEXT:    movq %xmm0, %r10
2514; SSE41-NEXT:    movq %xmm1, %r8
2515; SSE41-NEXT:    pextrq $1, %xmm0, %rax
2516; SSE41-NEXT:    pextrq $1, %xmm1, %rdx
2517; SSE41-NEXT:    xorl %esi, %esi
2518; SSE41-NEXT:    mulq %rdx
2519; SSE41-NEXT:    movq $-1, %r9
2520; SSE41-NEXT:    movl $0, %ecx
2521; SSE41-NEXT:    cmovoq %r9, %rcx
2522; SSE41-NEXT:    movq %rax, %xmm0
2523; SSE41-NEXT:    movq %r10, %rax
2524; SSE41-NEXT:    mulq %r8
2525; SSE41-NEXT:    movq %rax, %xmm1
2526; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2527; SSE41-NEXT:    movq %rcx, %xmm0
2528; SSE41-NEXT:    cmovoq %r9, %rsi
2529; SSE41-NEXT:    movq %rsi, %xmm2
2530; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2531; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2532; SSE41-NEXT:    movdqa %xmm1, (%rdi)
2533; SSE41-NEXT:    retq
2534;
2535; AVX-LABEL: umulo_v2i64:
2536; AVX:       # %bb.0:
2537; AVX-NEXT:    vmovq %xmm0, %r10
2538; AVX-NEXT:    vmovq %xmm1, %r8
2539; AVX-NEXT:    vpextrq $1, %xmm0, %rax
2540; AVX-NEXT:    vpextrq $1, %xmm1, %rdx
2541; AVX-NEXT:    xorl %esi, %esi
2542; AVX-NEXT:    mulq %rdx
2543; AVX-NEXT:    movq $-1, %r9
2544; AVX-NEXT:    movl $0, %ecx
2545; AVX-NEXT:    cmovoq %r9, %rcx
2546; AVX-NEXT:    vmovq %rax, %xmm0
2547; AVX-NEXT:    movq %r10, %rax
2548; AVX-NEXT:    mulq %r8
2549; AVX-NEXT:    vmovq %rax, %xmm1
2550; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2551; AVX-NEXT:    vmovq %rcx, %xmm0
2552; AVX-NEXT:    cmovoq %r9, %rsi
2553; AVX-NEXT:    vmovq %rsi, %xmm2
2554; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
2555; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2556; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
2557; AVX-NEXT:    retq
2558;
2559; AVX512F-LABEL: umulo_v2i64:
2560; AVX512F:       # %bb.0:
2561; AVX512F-NEXT:    vmovq %xmm0, %rcx
2562; AVX512F-NEXT:    vmovq %xmm1, %rsi
2563; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
2564; AVX512F-NEXT:    vpextrq $1, %xmm1, %rdx
2565; AVX512F-NEXT:    mulq %rdx
2566; AVX512F-NEXT:    seto %r8b
2567; AVX512F-NEXT:    vmovq %rax, %xmm0
2568; AVX512F-NEXT:    movq %rcx, %rax
2569; AVX512F-NEXT:    mulq %rsi
2570; AVX512F-NEXT:    vmovq %rax, %xmm1
2571; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2572; AVX512F-NEXT:    seto %al
2573; AVX512F-NEXT:    andl $1, %eax
2574; AVX512F-NEXT:    kmovw %eax, %k0
2575; AVX512F-NEXT:    kmovw %r8d, %k1
2576; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
2577; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
2578; AVX512F-NEXT:    korw %k1, %k0, %k1
2579; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
2580; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
2581; AVX512F-NEXT:    vmovdqa %xmm1, (%rdi)
2582; AVX512F-NEXT:    retq
2583;
2584; AVX512BW-LABEL: umulo_v2i64:
2585; AVX512BW:       # %bb.0:
2586; AVX512BW-NEXT:    vmovq %xmm0, %rcx
2587; AVX512BW-NEXT:    vmovq %xmm1, %rsi
2588; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
2589; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rdx
2590; AVX512BW-NEXT:    mulq %rdx
2591; AVX512BW-NEXT:    seto %r8b
2592; AVX512BW-NEXT:    vmovq %rax, %xmm0
2593; AVX512BW-NEXT:    movq %rcx, %rax
2594; AVX512BW-NEXT:    mulq %rsi
2595; AVX512BW-NEXT:    vmovq %rax, %xmm1
2596; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2597; AVX512BW-NEXT:    seto %al
2598; AVX512BW-NEXT:    andl $1, %eax
2599; AVX512BW-NEXT:    kmovw %eax, %k0
2600; AVX512BW-NEXT:    kmovd %r8d, %k1
2601; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
2602; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
2603; AVX512BW-NEXT:    korw %k1, %k0, %k1
2604; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
2605; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
2606; AVX512BW-NEXT:    vmovdqa %xmm1, (%rdi)
2607; AVX512BW-NEXT:    retq
2608  %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
2609  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
2610  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
2611  %res = sext <2 x i1> %obit to <2 x i32>
2612  store <2 x i64> %val, ptr %p2
2613  ret <2 x i32> %res
2614}
2615
2616define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
2617; SSE2-LABEL: umulo_v4i24:
2618; SSE2:       # %bb.0:
2619; SSE2-NEXT:    movdqa %xmm0, %xmm2
2620; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
2621; SSE2-NEXT:    pand %xmm0, %xmm1
2622; SSE2-NEXT:    pand %xmm0, %xmm2
2623; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
2624; SSE2-NEXT:    pmuludq %xmm1, %xmm2
2625; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
2626; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2627; SSE2-NEXT:    pmuludq %xmm0, %xmm1
2628; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
2629; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2630; SSE2-NEXT:    pxor %xmm4, %xmm4
2631; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
2632; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
2633; SSE2-NEXT:    pxor %xmm3, %xmm5
2634; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2635; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
2636; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2637; SSE2-NEXT:    psrld $24, %xmm0
2638; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
2639; SSE2-NEXT:    por %xmm5, %xmm0
2640; SSE2-NEXT:    movd %xmm2, %eax
2641; SSE2-NEXT:    movw %ax, (%rdi)
2642; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
2643; SSE2-NEXT:    movd %xmm2, %ecx
2644; SSE2-NEXT:    movw %cx, 6(%rdi)
2645; SSE2-NEXT:    movd %xmm1, %edx
2646; SSE2-NEXT:    movw %dx, 3(%rdi)
2647; SSE2-NEXT:    shrl $16, %eax
2648; SSE2-NEXT:    movb %al, 2(%rdi)
2649; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
2650; SSE2-NEXT:    movd %xmm1, %eax
2651; SSE2-NEXT:    movw %ax, 9(%rdi)
2652; SSE2-NEXT:    shrl $16, %ecx
2653; SSE2-NEXT:    movb %cl, 8(%rdi)
2654; SSE2-NEXT:    shrl $16, %edx
2655; SSE2-NEXT:    movb %dl, 5(%rdi)
2656; SSE2-NEXT:    shrl $16, %eax
2657; SSE2-NEXT:    movb %al, 11(%rdi)
2658; SSE2-NEXT:    retq
2659;
2660; SSSE3-LABEL: umulo_v4i24:
2661; SSSE3:       # %bb.0:
2662; SSSE3-NEXT:    movdqa %xmm0, %xmm2
2663; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
2664; SSSE3-NEXT:    pand %xmm0, %xmm1
2665; SSSE3-NEXT:    pand %xmm0, %xmm2
2666; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
2667; SSSE3-NEXT:    pmuludq %xmm1, %xmm2
2668; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
2669; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
2670; SSSE3-NEXT:    pmuludq %xmm0, %xmm1
2671; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
2672; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2673; SSSE3-NEXT:    pxor %xmm4, %xmm4
2674; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm3
2675; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
2676; SSSE3-NEXT:    pxor %xmm3, %xmm5
2677; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
2678; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
2679; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
2680; SSSE3-NEXT:    psrld $24, %xmm0
2681; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
2682; SSSE3-NEXT:    por %xmm5, %xmm0
2683; SSSE3-NEXT:    movd %xmm2, %eax
2684; SSSE3-NEXT:    movw %ax, (%rdi)
2685; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
2686; SSSE3-NEXT:    movd %xmm2, %ecx
2687; SSSE3-NEXT:    movw %cx, 6(%rdi)
2688; SSSE3-NEXT:    movd %xmm1, %edx
2689; SSSE3-NEXT:    movw %dx, 3(%rdi)
2690; SSSE3-NEXT:    shrl $16, %eax
2691; SSSE3-NEXT:    movb %al, 2(%rdi)
2692; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
2693; SSSE3-NEXT:    movd %xmm1, %eax
2694; SSSE3-NEXT:    movw %ax, 9(%rdi)
2695; SSSE3-NEXT:    shrl $16, %ecx
2696; SSSE3-NEXT:    movb %cl, 8(%rdi)
2697; SSSE3-NEXT:    shrl $16, %edx
2698; SSSE3-NEXT:    movb %dl, 5(%rdi)
2699; SSSE3-NEXT:    shrl $16, %eax
2700; SSSE3-NEXT:    movb %al, 11(%rdi)
2701; SSSE3-NEXT:    retq
2702;
2703; SSE41-LABEL: umulo_v4i24:
2704; SSE41:       # %bb.0:
2705; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
2706; SSE41-NEXT:    pand %xmm2, %xmm0
2707; SSE41-NEXT:    pand %xmm2, %xmm1
2708; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
2709; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2710; SSE41-NEXT:    pmuludq %xmm2, %xmm3
2711; SSE41-NEXT:    movdqa %xmm0, %xmm2
2712; SSE41-NEXT:    pmuludq %xmm1, %xmm2
2713; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
2714; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
2715; SSE41-NEXT:    pxor %xmm3, %xmm3
2716; SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
2717; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
2718; SSE41-NEXT:    pxor %xmm2, %xmm4
2719; SSE41-NEXT:    pmulld %xmm0, %xmm1
2720; SSE41-NEXT:    pextrd $3, %xmm1, %eax
2721; SSE41-NEXT:    pextrd $2, %xmm1, %ecx
2722; SSE41-NEXT:    pextrd $1, %xmm1, %edx
2723; SSE41-NEXT:    movd %xmm1, %esi
2724; SSE41-NEXT:    movdqa %xmm1, %xmm0
2725; SSE41-NEXT:    psrld $24, %xmm0
2726; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
2727; SSE41-NEXT:    por %xmm4, %xmm0
2728; SSE41-NEXT:    movw %ax, 9(%rdi)
2729; SSE41-NEXT:    movw %cx, 6(%rdi)
2730; SSE41-NEXT:    movw %dx, 3(%rdi)
2731; SSE41-NEXT:    movw %si, (%rdi)
2732; SSE41-NEXT:    shrl $16, %eax
2733; SSE41-NEXT:    movb %al, 11(%rdi)
2734; SSE41-NEXT:    shrl $16, %ecx
2735; SSE41-NEXT:    movb %cl, 8(%rdi)
2736; SSE41-NEXT:    shrl $16, %edx
2737; SSE41-NEXT:    movb %dl, 5(%rdi)
2738; SSE41-NEXT:    shrl $16, %esi
2739; SSE41-NEXT:    movb %sil, 2(%rdi)
2740; SSE41-NEXT:    retq
2741;
2742; AVX1-LABEL: umulo_v4i24:
2743; AVX1:       # %bb.0:
2744; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
2745; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
2746; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1
2747; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[1,1,3,3]
2748; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[1,1,3,3]
2749; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
2750; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
2751; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2752; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
2753; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
2754; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
2755; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
2756; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
2757; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
2758; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm0
2759; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm0, %xmm0
2760; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
2761; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
2762; AVX1-NEXT:    movw %ax, 9(%rdi)
2763; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
2764; AVX1-NEXT:    movw %cx, 6(%rdi)
2765; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
2766; AVX1-NEXT:    movw %dx, 3(%rdi)
2767; AVX1-NEXT:    vmovd %xmm1, %esi
2768; AVX1-NEXT:    movw %si, (%rdi)
2769; AVX1-NEXT:    shrl $16, %eax
2770; AVX1-NEXT:    movb %al, 11(%rdi)
2771; AVX1-NEXT:    shrl $16, %ecx
2772; AVX1-NEXT:    movb %cl, 8(%rdi)
2773; AVX1-NEXT:    shrl $16, %edx
2774; AVX1-NEXT:    movb %dl, 5(%rdi)
2775; AVX1-NEXT:    shrl $16, %esi
2776; AVX1-NEXT:    movb %sil, 2(%rdi)
2777; AVX1-NEXT:    retq
2778;
2779; AVX2-LABEL: umulo_v4i24:
2780; AVX2:       # %bb.0:
2781; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
2782; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
2783; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
2784; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
2785; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
2786; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
2787; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
2788; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
2789; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
2790; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
2791; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
2792; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
2793; AVX2-NEXT:    vpxor %xmm4, %xmm2, %xmm2
2794; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
2795; AVX2-NEXT:    vpsrld $24, %xmm1, %xmm0
2796; AVX2-NEXT:    vpcmpgtd %xmm3, %xmm0, %xmm0
2797; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
2798; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
2799; AVX2-NEXT:    movw %ax, 9(%rdi)
2800; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
2801; AVX2-NEXT:    movw %cx, 6(%rdi)
2802; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
2803; AVX2-NEXT:    movw %dx, 3(%rdi)
2804; AVX2-NEXT:    vmovd %xmm1, %esi
2805; AVX2-NEXT:    movw %si, (%rdi)
2806; AVX2-NEXT:    shrl $16, %eax
2807; AVX2-NEXT:    movb %al, 11(%rdi)
2808; AVX2-NEXT:    shrl $16, %ecx
2809; AVX2-NEXT:    movb %cl, 8(%rdi)
2810; AVX2-NEXT:    shrl $16, %edx
2811; AVX2-NEXT:    movb %dl, 5(%rdi)
2812; AVX2-NEXT:    shrl $16, %esi
2813; AVX2-NEXT:    movb %sil, 2(%rdi)
2814; AVX2-NEXT:    retq
2815;
2816; AVX512-LABEL: umulo_v4i24:
2817; AVX512:       # %bb.0:
2818; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
2819; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
2820; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
2821; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
2822; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
2823; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
2824; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
2825; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
2826; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
2827; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
2828; AVX512-NEXT:    vpsrld $24, %xmm1, %xmm0
2829; AVX512-NEXT:    vpor %xmm4, %xmm0, %xmm0
2830; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
2831; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
2832; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
2833; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
2834; AVX512-NEXT:    movw %ax, 9(%rdi)
2835; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
2836; AVX512-NEXT:    movw %cx, 6(%rdi)
2837; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
2838; AVX512-NEXT:    movw %dx, 3(%rdi)
2839; AVX512-NEXT:    vmovd %xmm1, %esi
2840; AVX512-NEXT:    movw %si, (%rdi)
2841; AVX512-NEXT:    shrl $16, %eax
2842; AVX512-NEXT:    movb %al, 11(%rdi)
2843; AVX512-NEXT:    shrl $16, %ecx
2844; AVX512-NEXT:    movb %cl, 8(%rdi)
2845; AVX512-NEXT:    shrl $16, %edx
2846; AVX512-NEXT:    movb %dl, 5(%rdi)
2847; AVX512-NEXT:    shrl $16, %esi
2848; AVX512-NEXT:    movb %sil, 2(%rdi)
2849; AVX512-NEXT:    retq
2850  %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
2851  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
2852  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
2853  %res = sext <4 x i1> %obit to <4 x i32>
2854  store <4 x i24> %val, ptr %p2
2855  ret <4 x i32> %res
2856}
2857
2858define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
2859; SSE-LABEL: umulo_v4i1:
2860; SSE:       # %bb.0:
2861; SSE-NEXT:    pand %xmm1, %xmm0
2862; SSE-NEXT:    pslld $31, %xmm0
2863; SSE-NEXT:    movmskps %xmm0, %eax
2864; SSE-NEXT:    movb %al, (%rdi)
2865; SSE-NEXT:    xorps %xmm0, %xmm0
2866; SSE-NEXT:    retq
2867;
2868; AVX-LABEL: umulo_v4i1:
2869; AVX:       # %bb.0:
2870; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
2871; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
2872; AVX-NEXT:    vmovmskps %xmm0, %eax
2873; AVX-NEXT:    movb %al, (%rdi)
2874; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2875; AVX-NEXT:    retq
2876;
2877; AVX512F-LABEL: umulo_v4i1:
2878; AVX512F:       # %bb.0:
2879; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
2880; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
2881; AVX512F-NEXT:    vptestmd %xmm0, %xmm0, %k0
2882; AVX512F-NEXT:    kmovw %k0, %eax
2883; AVX512F-NEXT:    movb %al, (%rdi)
2884; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2885; AVX512F-NEXT:    retq
2886;
2887; AVX512BW-LABEL: umulo_v4i1:
2888; AVX512BW:       # %bb.0:
2889; AVX512BW-NEXT:    vpand %xmm1, %xmm0, %xmm0
2890; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
2891; AVX512BW-NEXT:    vptestmd %xmm0, %xmm0, %k0
2892; AVX512BW-NEXT:    kmovd %k0, %eax
2893; AVX512BW-NEXT:    movb %al, (%rdi)
2894; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2895; AVX512BW-NEXT:    retq
2896  %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
2897  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
2898  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
2899  %res = sext <4 x i1> %obit to <4 x i32>
2900  store <4 x i1> %val, ptr %p2
2901  ret <4 x i32> %res
2902}
2903
2904define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind {
2905; SSE2-LABEL: umulo_v2i128:
2906; SSE2:       # %bb.0:
2907; SSE2-NEXT:    pushq %rbp
2908; SSE2-NEXT:    pushq %r15
2909; SSE2-NEXT:    pushq %r14
2910; SSE2-NEXT:    pushq %r13
2911; SSE2-NEXT:    pushq %r12
2912; SSE2-NEXT:    pushq %rbx
2913; SSE2-NEXT:    movq %r9, %r10
2914; SSE2-NEXT:    movq %rcx, %r12
2915; SSE2-NEXT:    movq %rdx, %r11
2916; SSE2-NEXT:    movq %rsi, %rax
2917; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
2918; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
2919; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r9
2920; SSE2-NEXT:    testq %r10, %r10
2921; SSE2-NEXT:    setne %dl
2922; SSE2-NEXT:    testq %rsi, %rsi
2923; SSE2-NEXT:    setne %bpl
2924; SSE2-NEXT:    andb %dl, %bpl
2925; SSE2-NEXT:    mulq %r8
2926; SSE2-NEXT:    movq %rax, %rsi
2927; SSE2-NEXT:    seto %bl
2928; SSE2-NEXT:    movq %r10, %rax
2929; SSE2-NEXT:    mulq %rdi
2930; SSE2-NEXT:    seto %cl
2931; SSE2-NEXT:    orb %bl, %cl
2932; SSE2-NEXT:    leaq (%rsi,%rax), %rbx
2933; SSE2-NEXT:    movq %rdi, %rax
2934; SSE2-NEXT:    mulq %r8
2935; SSE2-NEXT:    movq %rax, %rdi
2936; SSE2-NEXT:    movq %rdx, %rsi
2937; SSE2-NEXT:    addq %rbx, %rsi
2938; SSE2-NEXT:    setb %r13b
2939; SSE2-NEXT:    orb %cl, %r13b
2940; SSE2-NEXT:    orb %bpl, %r13b
2941; SSE2-NEXT:    testq %r9, %r9
2942; SSE2-NEXT:    setne %al
2943; SSE2-NEXT:    testq %r12, %r12
2944; SSE2-NEXT:    setne %r10b
2945; SSE2-NEXT:    andb %al, %r10b
2946; SSE2-NEXT:    movq %r12, %rax
2947; SSE2-NEXT:    mulq %r14
2948; SSE2-NEXT:    movq %rax, %rbp
2949; SSE2-NEXT:    seto %r8b
2950; SSE2-NEXT:    movq %r9, %rax
2951; SSE2-NEXT:    mulq %r11
2952; SSE2-NEXT:    seto %cl
2953; SSE2-NEXT:    orb %r8b, %cl
2954; SSE2-NEXT:    addq %rax, %rbp
2955; SSE2-NEXT:    movq %r11, %rax
2956; SSE2-NEXT:    mulq %r14
2957; SSE2-NEXT:    addq %rbp, %rdx
2958; SSE2-NEXT:    setb %bl
2959; SSE2-NEXT:    orb %cl, %bl
2960; SSE2-NEXT:    orb %r10b, %bl
2961; SSE2-NEXT:    movzbl %bl, %ecx
2962; SSE2-NEXT:    negl %ecx
2963; SSE2-NEXT:    movd %ecx, %xmm1
2964; SSE2-NEXT:    movzbl %r13b, %ecx
2965; SSE2-NEXT:    negl %ecx
2966; SSE2-NEXT:    movd %ecx, %xmm0
2967; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2968; SSE2-NEXT:    movq %rax, 16(%r15)
2969; SSE2-NEXT:    movq %rdi, (%r15)
2970; SSE2-NEXT:    movq %rdx, 24(%r15)
2971; SSE2-NEXT:    movq %rsi, 8(%r15)
2972; SSE2-NEXT:    popq %rbx
2973; SSE2-NEXT:    popq %r12
2974; SSE2-NEXT:    popq %r13
2975; SSE2-NEXT:    popq %r14
2976; SSE2-NEXT:    popq %r15
2977; SSE2-NEXT:    popq %rbp
2978; SSE2-NEXT:    retq
2979;
2980; SSSE3-LABEL: umulo_v2i128:
2981; SSSE3:       # %bb.0:
2982; SSSE3-NEXT:    pushq %rbp
2983; SSSE3-NEXT:    pushq %r15
2984; SSSE3-NEXT:    pushq %r14
2985; SSSE3-NEXT:    pushq %r13
2986; SSSE3-NEXT:    pushq %r12
2987; SSSE3-NEXT:    pushq %rbx
2988; SSSE3-NEXT:    movq %r9, %r10
2989; SSSE3-NEXT:    movq %rcx, %r12
2990; SSSE3-NEXT:    movq %rdx, %r11
2991; SSSE3-NEXT:    movq %rsi, %rax
2992; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r15
2993; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r14
2994; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r9
2995; SSSE3-NEXT:    testq %r10, %r10
2996; SSSE3-NEXT:    setne %dl
2997; SSSE3-NEXT:    testq %rsi, %rsi
2998; SSSE3-NEXT:    setne %bpl
2999; SSSE3-NEXT:    andb %dl, %bpl
3000; SSSE3-NEXT:    mulq %r8
3001; SSSE3-NEXT:    movq %rax, %rsi
3002; SSSE3-NEXT:    seto %bl
3003; SSSE3-NEXT:    movq %r10, %rax
3004; SSSE3-NEXT:    mulq %rdi
3005; SSSE3-NEXT:    seto %cl
3006; SSSE3-NEXT:    orb %bl, %cl
3007; SSSE3-NEXT:    leaq (%rsi,%rax), %rbx
3008; SSSE3-NEXT:    movq %rdi, %rax
3009; SSSE3-NEXT:    mulq %r8
3010; SSSE3-NEXT:    movq %rax, %rdi
3011; SSSE3-NEXT:    movq %rdx, %rsi
3012; SSSE3-NEXT:    addq %rbx, %rsi
3013; SSSE3-NEXT:    setb %r13b
3014; SSSE3-NEXT:    orb %cl, %r13b
3015; SSSE3-NEXT:    orb %bpl, %r13b
3016; SSSE3-NEXT:    testq %r9, %r9
3017; SSSE3-NEXT:    setne %al
3018; SSSE3-NEXT:    testq %r12, %r12
3019; SSSE3-NEXT:    setne %r10b
3020; SSSE3-NEXT:    andb %al, %r10b
3021; SSSE3-NEXT:    movq %r12, %rax
3022; SSSE3-NEXT:    mulq %r14
3023; SSSE3-NEXT:    movq %rax, %rbp
3024; SSSE3-NEXT:    seto %r8b
3025; SSSE3-NEXT:    movq %r9, %rax
3026; SSSE3-NEXT:    mulq %r11
3027; SSSE3-NEXT:    seto %cl
3028; SSSE3-NEXT:    orb %r8b, %cl
3029; SSSE3-NEXT:    addq %rax, %rbp
3030; SSSE3-NEXT:    movq %r11, %rax
3031; SSSE3-NEXT:    mulq %r14
3032; SSSE3-NEXT:    addq %rbp, %rdx
3033; SSSE3-NEXT:    setb %bl
3034; SSSE3-NEXT:    orb %cl, %bl
3035; SSSE3-NEXT:    orb %r10b, %bl
3036; SSSE3-NEXT:    movzbl %bl, %ecx
3037; SSSE3-NEXT:    negl %ecx
3038; SSSE3-NEXT:    movd %ecx, %xmm1
3039; SSSE3-NEXT:    movzbl %r13b, %ecx
3040; SSSE3-NEXT:    negl %ecx
3041; SSSE3-NEXT:    movd %ecx, %xmm0
3042; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3043; SSSE3-NEXT:    movq %rax, 16(%r15)
3044; SSSE3-NEXT:    movq %rdi, (%r15)
3045; SSSE3-NEXT:    movq %rdx, 24(%r15)
3046; SSSE3-NEXT:    movq %rsi, 8(%r15)
3047; SSSE3-NEXT:    popq %rbx
3048; SSSE3-NEXT:    popq %r12
3049; SSSE3-NEXT:    popq %r13
3050; SSSE3-NEXT:    popq %r14
3051; SSSE3-NEXT:    popq %r15
3052; SSSE3-NEXT:    popq %rbp
3053; SSSE3-NEXT:    retq
3054;
3055; SSE41-LABEL: umulo_v2i128:
3056; SSE41:       # %bb.0:
3057; SSE41-NEXT:    pushq %rbp
3058; SSE41-NEXT:    pushq %r15
3059; SSE41-NEXT:    pushq %r14
3060; SSE41-NEXT:    pushq %r13
3061; SSE41-NEXT:    pushq %r12
3062; SSE41-NEXT:    pushq %rbx
3063; SSE41-NEXT:    movq %r9, %r10
3064; SSE41-NEXT:    movq %rcx, %r12
3065; SSE41-NEXT:    movq %rdx, %r11
3066; SSE41-NEXT:    movq %rsi, %rax
3067; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r15
3068; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r14
3069; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r9
3070; SSE41-NEXT:    testq %r10, %r10
3071; SSE41-NEXT:    setne %dl
3072; SSE41-NEXT:    testq %rsi, %rsi
3073; SSE41-NEXT:    setne %bpl
3074; SSE41-NEXT:    andb %dl, %bpl
3075; SSE41-NEXT:    mulq %r8
3076; SSE41-NEXT:    movq %rax, %rsi
3077; SSE41-NEXT:    seto %bl
3078; SSE41-NEXT:    movq %r10, %rax
3079; SSE41-NEXT:    mulq %rdi
3080; SSE41-NEXT:    seto %cl
3081; SSE41-NEXT:    orb %bl, %cl
3082; SSE41-NEXT:    leaq (%rsi,%rax), %rbx
3083; SSE41-NEXT:    movq %rdi, %rax
3084; SSE41-NEXT:    mulq %r8
3085; SSE41-NEXT:    movq %rax, %rdi
3086; SSE41-NEXT:    movq %rdx, %rsi
3087; SSE41-NEXT:    addq %rbx, %rsi
3088; SSE41-NEXT:    setb %r13b
3089; SSE41-NEXT:    orb %cl, %r13b
3090; SSE41-NEXT:    orb %bpl, %r13b
3091; SSE41-NEXT:    testq %r9, %r9
3092; SSE41-NEXT:    setne %al
3093; SSE41-NEXT:    testq %r12, %r12
3094; SSE41-NEXT:    setne %r10b
3095; SSE41-NEXT:    andb %al, %r10b
3096; SSE41-NEXT:    movq %r12, %rax
3097; SSE41-NEXT:    mulq %r14
3098; SSE41-NEXT:    movq %rax, %rbp
3099; SSE41-NEXT:    seto %r8b
3100; SSE41-NEXT:    movq %r9, %rax
3101; SSE41-NEXT:    mulq %r11
3102; SSE41-NEXT:    seto %cl
3103; SSE41-NEXT:    orb %r8b, %cl
3104; SSE41-NEXT:    addq %rax, %rbp
3105; SSE41-NEXT:    movq %r11, %rax
3106; SSE41-NEXT:    mulq %r14
3107; SSE41-NEXT:    addq %rbp, %rdx
3108; SSE41-NEXT:    setb %bl
3109; SSE41-NEXT:    orb %cl, %bl
3110; SSE41-NEXT:    orb %r10b, %bl
3111; SSE41-NEXT:    movzbl %bl, %ecx
3112; SSE41-NEXT:    negl %ecx
3113; SSE41-NEXT:    movzbl %r13b, %ebp
3114; SSE41-NEXT:    negl %ebp
3115; SSE41-NEXT:    movd %ebp, %xmm0
3116; SSE41-NEXT:    pinsrd $1, %ecx, %xmm0
3117; SSE41-NEXT:    movq %rax, 16(%r15)
3118; SSE41-NEXT:    movq %rdi, (%r15)
3119; SSE41-NEXT:    movq %rdx, 24(%r15)
3120; SSE41-NEXT:    movq %rsi, 8(%r15)
3121; SSE41-NEXT:    popq %rbx
3122; SSE41-NEXT:    popq %r12
3123; SSE41-NEXT:    popq %r13
3124; SSE41-NEXT:    popq %r14
3125; SSE41-NEXT:    popq %r15
3126; SSE41-NEXT:    popq %rbp
3127; SSE41-NEXT:    retq
3128;
3129; AVX-LABEL: umulo_v2i128:
3130; AVX:       # %bb.0:
3131; AVX-NEXT:    pushq %rbp
3132; AVX-NEXT:    pushq %r15
3133; AVX-NEXT:    pushq %r14
3134; AVX-NEXT:    pushq %r13
3135; AVX-NEXT:    pushq %r12
3136; AVX-NEXT:    pushq %rbx
3137; AVX-NEXT:    movq %r9, %r10
3138; AVX-NEXT:    movq %rcx, %r12
3139; AVX-NEXT:    movq %rdx, %r11
3140; AVX-NEXT:    movq %rsi, %rax
3141; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r15
3142; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r14
3143; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r9
3144; AVX-NEXT:    testq %r10, %r10
3145; AVX-NEXT:    setne %dl
3146; AVX-NEXT:    testq %rsi, %rsi
3147; AVX-NEXT:    setne %bpl
3148; AVX-NEXT:    andb %dl, %bpl
3149; AVX-NEXT:    mulq %r8
3150; AVX-NEXT:    movq %rax, %rsi
3151; AVX-NEXT:    seto %bl
3152; AVX-NEXT:    movq %r10, %rax
3153; AVX-NEXT:    mulq %rdi
3154; AVX-NEXT:    seto %cl
3155; AVX-NEXT:    orb %bl, %cl
3156; AVX-NEXT:    leaq (%rsi,%rax), %rbx
3157; AVX-NEXT:    movq %rdi, %rax
3158; AVX-NEXT:    mulq %r8
3159; AVX-NEXT:    movq %rax, %rdi
3160; AVX-NEXT:    movq %rdx, %rsi
3161; AVX-NEXT:    addq %rbx, %rsi
3162; AVX-NEXT:    setb %r13b
3163; AVX-NEXT:    orb %cl, %r13b
3164; AVX-NEXT:    orb %bpl, %r13b
3165; AVX-NEXT:    testq %r9, %r9
3166; AVX-NEXT:    setne %al
3167; AVX-NEXT:    testq %r12, %r12
3168; AVX-NEXT:    setne %r10b
3169; AVX-NEXT:    andb %al, %r10b
3170; AVX-NEXT:    movq %r12, %rax
3171; AVX-NEXT:    mulq %r14
3172; AVX-NEXT:    movq %rax, %rbp
3173; AVX-NEXT:    seto %r8b
3174; AVX-NEXT:    movq %r9, %rax
3175; AVX-NEXT:    mulq %r11
3176; AVX-NEXT:    seto %cl
3177; AVX-NEXT:    orb %r8b, %cl
3178; AVX-NEXT:    addq %rax, %rbp
3179; AVX-NEXT:    movq %r11, %rax
3180; AVX-NEXT:    mulq %r14
3181; AVX-NEXT:    addq %rbp, %rdx
3182; AVX-NEXT:    setb %bl
3183; AVX-NEXT:    orb %cl, %bl
3184; AVX-NEXT:    orb %r10b, %bl
3185; AVX-NEXT:    movzbl %bl, %ecx
3186; AVX-NEXT:    negl %ecx
3187; AVX-NEXT:    movzbl %r13b, %ebp
3188; AVX-NEXT:    negl %ebp
3189; AVX-NEXT:    vmovd %ebp, %xmm0
3190; AVX-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
3191; AVX-NEXT:    movq %rax, 16(%r15)
3192; AVX-NEXT:    movq %rdi, (%r15)
3193; AVX-NEXT:    movq %rdx, 24(%r15)
3194; AVX-NEXT:    movq %rsi, 8(%r15)
3195; AVX-NEXT:    popq %rbx
3196; AVX-NEXT:    popq %r12
3197; AVX-NEXT:    popq %r13
3198; AVX-NEXT:    popq %r14
3199; AVX-NEXT:    popq %r15
3200; AVX-NEXT:    popq %rbp
3201; AVX-NEXT:    retq
3202;
3203; AVX512F-LABEL: umulo_v2i128:
3204; AVX512F:       # %bb.0:
3205; AVX512F-NEXT:    pushq %rbp
3206; AVX512F-NEXT:    pushq %r15
3207; AVX512F-NEXT:    pushq %r14
3208; AVX512F-NEXT:    pushq %r12
3209; AVX512F-NEXT:    pushq %rbx
3210; AVX512F-NEXT:    movq %rcx, %rax
3211; AVX512F-NEXT:    movq %rdx, %r12
3212; AVX512F-NEXT:    movq %rdi, %r11
3213; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r14
3214; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r15
3215; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3216; AVX512F-NEXT:    testq %r10, %r10
3217; AVX512F-NEXT:    setne %dl
3218; AVX512F-NEXT:    testq %rcx, %rcx
3219; AVX512F-NEXT:    setne %bl
3220; AVX512F-NEXT:    andb %dl, %bl
3221; AVX512F-NEXT:    mulq %r15
3222; AVX512F-NEXT:    movq %rax, %rdi
3223; AVX512F-NEXT:    seto %bpl
3224; AVX512F-NEXT:    movq %r10, %rax
3225; AVX512F-NEXT:    mulq %r12
3226; AVX512F-NEXT:    seto %cl
3227; AVX512F-NEXT:    orb %bpl, %cl
3228; AVX512F-NEXT:    leaq (%rdi,%rax), %rbp
3229; AVX512F-NEXT:    movq %r12, %rax
3230; AVX512F-NEXT:    mulq %r15
3231; AVX512F-NEXT:    movq %rax, %r10
3232; AVX512F-NEXT:    movq %rdx, %rdi
3233; AVX512F-NEXT:    addq %rbp, %rdi
3234; AVX512F-NEXT:    setb %al
3235; AVX512F-NEXT:    orb %cl, %al
3236; AVX512F-NEXT:    orb %bl, %al
3237; AVX512F-NEXT:    kmovw %eax, %k0
3238; AVX512F-NEXT:    testq %r9, %r9
3239; AVX512F-NEXT:    setne %al
3240; AVX512F-NEXT:    testq %rsi, %rsi
3241; AVX512F-NEXT:    setne %cl
3242; AVX512F-NEXT:    andb %al, %cl
3243; AVX512F-NEXT:    movq %rsi, %rax
3244; AVX512F-NEXT:    mulq %r8
3245; AVX512F-NEXT:    movq %rax, %rsi
3246; AVX512F-NEXT:    seto %bpl
3247; AVX512F-NEXT:    movq %r9, %rax
3248; AVX512F-NEXT:    mulq %r11
3249; AVX512F-NEXT:    seto %bl
3250; AVX512F-NEXT:    orb %bpl, %bl
3251; AVX512F-NEXT:    addq %rax, %rsi
3252; AVX512F-NEXT:    movq %r11, %rax
3253; AVX512F-NEXT:    mulq %r8
3254; AVX512F-NEXT:    addq %rsi, %rdx
3255; AVX512F-NEXT:    setb %sil
3256; AVX512F-NEXT:    orb %bl, %sil
3257; AVX512F-NEXT:    orb %cl, %sil
3258; AVX512F-NEXT:    andl $1, %esi
3259; AVX512F-NEXT:    kmovw %esi, %k1
3260; AVX512F-NEXT:    kshiftlw $1, %k0, %k0
3261; AVX512F-NEXT:    korw %k0, %k1, %k1
3262; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3263; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
3264; AVX512F-NEXT:    movq %r10, 16(%r14)
3265; AVX512F-NEXT:    movq %rax, (%r14)
3266; AVX512F-NEXT:    movq %rdi, 24(%r14)
3267; AVX512F-NEXT:    movq %rdx, 8(%r14)
3268; AVX512F-NEXT:    popq %rbx
3269; AVX512F-NEXT:    popq %r12
3270; AVX512F-NEXT:    popq %r14
3271; AVX512F-NEXT:    popq %r15
3272; AVX512F-NEXT:    popq %rbp
3273; AVX512F-NEXT:    retq
3274;
3275; AVX512BW-LABEL: umulo_v2i128:
3276; AVX512BW:       # %bb.0:
3277; AVX512BW-NEXT:    pushq %rbp
3278; AVX512BW-NEXT:    pushq %r15
3279; AVX512BW-NEXT:    pushq %r14
3280; AVX512BW-NEXT:    pushq %r12
3281; AVX512BW-NEXT:    pushq %rbx
3282; AVX512BW-NEXT:    movq %rcx, %rax
3283; AVX512BW-NEXT:    movq %rdx, %r12
3284; AVX512BW-NEXT:    movq %rdi, %r11
3285; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r14
3286; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r15
3287; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
3288; AVX512BW-NEXT:    testq %r10, %r10
3289; AVX512BW-NEXT:    setne %dl
3290; AVX512BW-NEXT:    testq %rcx, %rcx
3291; AVX512BW-NEXT:    setne %bl
3292; AVX512BW-NEXT:    andb %dl, %bl
3293; AVX512BW-NEXT:    mulq %r15
3294; AVX512BW-NEXT:    movq %rax, %rdi
3295; AVX512BW-NEXT:    seto %bpl
3296; AVX512BW-NEXT:    movq %r10, %rax
3297; AVX512BW-NEXT:    mulq %r12
3298; AVX512BW-NEXT:    seto %cl
3299; AVX512BW-NEXT:    orb %bpl, %cl
3300; AVX512BW-NEXT:    leaq (%rdi,%rax), %rbp
3301; AVX512BW-NEXT:    movq %r12, %rax
3302; AVX512BW-NEXT:    mulq %r15
3303; AVX512BW-NEXT:    movq %rax, %r10
3304; AVX512BW-NEXT:    movq %rdx, %rdi
3305; AVX512BW-NEXT:    addq %rbp, %rdi
3306; AVX512BW-NEXT:    setb %al
3307; AVX512BW-NEXT:    orb %cl, %al
3308; AVX512BW-NEXT:    orb %bl, %al
3309; AVX512BW-NEXT:    kmovd %eax, %k0
3310; AVX512BW-NEXT:    testq %r9, %r9
3311; AVX512BW-NEXT:    setne %al
3312; AVX512BW-NEXT:    testq %rsi, %rsi
3313; AVX512BW-NEXT:    setne %cl
3314; AVX512BW-NEXT:    andb %al, %cl
3315; AVX512BW-NEXT:    movq %rsi, %rax
3316; AVX512BW-NEXT:    mulq %r8
3317; AVX512BW-NEXT:    movq %rax, %rsi
3318; AVX512BW-NEXT:    seto %bpl
3319; AVX512BW-NEXT:    movq %r9, %rax
3320; AVX512BW-NEXT:    mulq %r11
3321; AVX512BW-NEXT:    seto %bl
3322; AVX512BW-NEXT:    orb %bpl, %bl
3323; AVX512BW-NEXT:    addq %rax, %rsi
3324; AVX512BW-NEXT:    movq %r11, %rax
3325; AVX512BW-NEXT:    mulq %r8
3326; AVX512BW-NEXT:    addq %rsi, %rdx
3327; AVX512BW-NEXT:    setb %sil
3328; AVX512BW-NEXT:    orb %bl, %sil
3329; AVX512BW-NEXT:    orb %cl, %sil
3330; AVX512BW-NEXT:    andl $1, %esi
3331; AVX512BW-NEXT:    kmovw %esi, %k1
3332; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
3333; AVX512BW-NEXT:    korw %k0, %k1, %k1
3334; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
3335; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
3336; AVX512BW-NEXT:    movq %r10, 16(%r14)
3337; AVX512BW-NEXT:    movq %rax, (%r14)
3338; AVX512BW-NEXT:    movq %rdi, 24(%r14)
3339; AVX512BW-NEXT:    movq %rdx, 8(%r14)
3340; AVX512BW-NEXT:    popq %rbx
3341; AVX512BW-NEXT:    popq %r12
3342; AVX512BW-NEXT:    popq %r14
3343; AVX512BW-NEXT:    popq %r15
3344; AVX512BW-NEXT:    popq %rbp
3345; AVX512BW-NEXT:    retq
3346  %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
3347  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
3348  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
3349  %res = sext <2 x i1> %obit to <2 x i32>
3350  store <2 x i128> %val, ptr %p2
3351  ret <2 x i32> %res
3352}
3353