; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW
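; Lowering of llvm.umul.with.overflow for vector types. x86 has no vector
; overflow flag, so unsigned multiply overflow is detected by checking whether
; the high half of the widened product is non-zero.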

declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>)
declare {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32>, <2 x i32>)
declare {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32>, <3 x i32>)
declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>)
declare {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32>, <6 x i32>)
declare {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>)
declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>)

declare {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>)
declare {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>)
declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>)
declare {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>)
declare {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>)

declare {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24>, <4 x i24>)
declare {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1>, <4 x i1>)
declare {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128>, <2 x i128>)

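; <1 x i32> is scalarized: a plain 32-bit MULL sets OF when the high half of
; the product is non-zero, so the overflow bit comes straight from SETO and is
; sign-extended with NEGL.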
define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind {
; CHECK-LABEL: umulo_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movq %rdx, %rcx
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    xorl %edi, %edi
; CHECK-NEXT:    mull %esi
; CHECK-NEXT:    seto %dil
; CHECK-NEXT:    negl %edi
; CHECK-NEXT:    movl %eax, (%rcx)
; CHECK-NEXT:    movl %edi, %eax
; CHECK-NEXT:    retq
  %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1)
  %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0
  %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1
  %res = sext <1 x i1> %obit to <1 x i32>
  store <1 x i32> %val, <1 x i32>* %p2
  ret <1 x i32> %res
}

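; For i32 vectors there is no widening multiply that also reports overflow, so
; PMULUDQ produces full 64-bit products and the high 32 bits of each product
; are gathered and compared against zero: a non-zero high half means overflow.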
define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind {
; SSE2-LABEL: umulo_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v2i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    movq %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm3, %xmm2
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movq %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v2i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovq %xmm0, (%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v2i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovq %xmm0, (%rdi)
; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT:    vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1)
  %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i32> %val, <2 x i32>* %p2
  ret <2 x i32> %res
}

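; Same pattern as v2i32/v4i32, but the 12-byte result store is split into a
; MOVQ for the low two lanes plus a MOVD/PEXTRD for lane 2.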
define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind {
; SSE2-LABEL: umulo_v3i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    movd %xmm2, 8(%rdi)
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v3i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    movd %xmm2, 8(%rdi)
; SSSE3-NEXT:    movq %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v3i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm3, %xmm2
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pextrd $2, %xmm0, 8(%rdi)
; SSE41-NEXT:    movq %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v3i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; AVX1-NEXT:    vmovq %xmm0, (%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v3i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpextrd $2, %xmm0, 8(%rdi)
; AVX2-NEXT:    vmovq %xmm0, (%rdi)
; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v3i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT:    vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $2, %xmm1, 8(%rdi)
; AVX512-NEXT:    vmovq %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1)
  %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0
  %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1
  %res = sext <3 x i1> %obit to <3 x i32>
  store <3 x i32> %val, <3 x i32>* %p2
  ret <3 x i32> %res
}

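; On SSE4.1 and later the low halves of the products come from PMULLD
; directly; SSE2 has to reassemble them from the PMULUDQ results with
; shuffles.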
define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind {
; SSE2-LABEL: umulo_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movdqa %xmm0, (%rdi)
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm1
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    movdqa %xmm0, (%rdi)
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE41-NEXT:    pxor %xmm3, %xmm2
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    vmovdqa %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT:    vptestmd %xmm4, %xmm4, %k1
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1)
  %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i32> %val, <4 x i32>* %p2
  ret <4 x i32> %res
}

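; <6 x i32> is not a legal type. With SSE the operands arrive split across
; GPRs and the stack and the sign-extended overflow result is returned through
; a hidden sret pointer in %rdi; with AVX the type is widened to <8 x i32> in
; a ymm register.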
define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind {
; SSE2-LABEL: umulo_v6i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE2-NEXT:    movd %r8d, %xmm0
; SSE2-NEXT:    movd %ecx, %xmm1
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movd %edx, %xmm0
; SSE2-NEXT:    movd %esi, %xmm3
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSE2-NEXT:    movd %r9d, %xmm1
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
; SSE2-NEXT:    pxor %xmm5, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSE2-NEXT:    pmuludq %xmm2, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm7
; SSE2-NEXT:    pxor %xmm5, %xmm7
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSE2-NEXT:    movq %xmm0, 16(%rcx)
; SSE2-NEXT:    movdqa %xmm3, (%rcx)
; SSE2-NEXT:    movq %xmm7, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v6i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSSE3-NEXT:    movd %r8d, %xmm0
; SSSE3-NEXT:    movd %ecx, %xmm1
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movd %edx, %xmm0
; SSSE3-NEXT:    movd %esi, %xmm3
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; SSSE3-NEXT:    movd %r9d, %xmm1
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    pmuludq %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm4, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT:    pxor %xmm4, %xmm4
; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
; SSSE3-NEXT:    pxor %xmm5, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm6 = mem[0],zero,zero,zero
; SSSE3-NEXT:    pmuludq %xmm2, %xmm6
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm7
; SSSE3-NEXT:    pxor %xmm5, %xmm7
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
; SSSE3-NEXT:    movq %xmm0, 16(%rcx)
; SSSE3-NEXT:    movdqa %xmm3, (%rcx)
; SSSE3-NEXT:    movq %xmm7, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v6i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    movd %esi, %xmm2
; SSE41-NEXT:    pinsrd $1, %edx, %xmm2
; SSE41-NEXT:    pinsrd $2, %ecx, %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm1
; SSE41-NEXT:    pinsrd $2, {{[0-9]+}}(%rsp), %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pmuludq %xmm2, %xmm1
; SSE41-NEXT:    pinsrd $3, %r8d, %xmm2
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    movd %r9d, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm5
; SSE41-NEXT:    pmuludq %xmm3, %xmm4
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm3
; SSE41-NEXT:    pinsrd $1, {{[0-9]+}}(%rsp), %xmm5
; SSE41-NEXT:    pmulld %xmm3, %xmm5
; SSE41-NEXT:    pinsrd $3, {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm3, %xmm6
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7]
; SSE41-NEXT:    pxor %xmm8, %xmm8
; SSE41-NEXT:    pcmpeqd %xmm8, %xmm1
; SSE41-NEXT:    pcmpeqd %xmm6, %xmm6
; SSE41-NEXT:    pxor %xmm6, %xmm1
; SSE41-NEXT:    movd {{.*#+}} xmm7 = mem[0],zero,zero,zero
; SSE41-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
; SSE41-NEXT:    pmuludq %xmm7, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; SSE41-NEXT:    pcmpeqd %xmm8, %xmm4
; SSE41-NEXT:    pxor %xmm6, %xmm4
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    movq %xmm5, 16(%rcx)
; SSE41-NEXT:    movdqa %xmm0, (%rcx)
; SSE41-NEXT:    movq %xmm4, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm1, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v6i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm8, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm8, %xmm5
; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
; AVX1-NEXT:    vmovq %xmm1, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    vmovaps %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v6i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm3
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovq %xmm1, 16(%rdi)
; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v6i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT:    vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT:    vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT:    vmovq %xmm2, 16(%rdi)
; AVX512-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1)
  %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0
  %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1
  %res = sext <6 x i1> %obit to <6 x i32>
  store <6 x i32> %val, <6 x i32>* %p2
  ret <6 x i32> %res
}

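; AVX1 has no 256-bit integer ops, so each 128-bit half is multiplied and
; compared separately before the masks are reassembled with VINSERTF128;
; AVX2 and AVX-512 work on whole ymm registers.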
define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind {
; SSE2-LABEL: umulo_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    pxor %xmm8, %xmm8
; SSE2-NEXT:    pcmpeqd %xmm8, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm7, %xmm7
; SSE2-NEXT:    pxor %xmm7, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    pcmpeqd %xmm8, %xmm2
; SSE2-NEXT:    pxor %xmm7, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm4, (%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v8i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm5, %xmm6
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    pxor %xmm8, %xmm8
; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm7, %xmm7
; SSSE3-NEXT:    pxor %xmm7, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm3, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm5, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSSE3-NEXT:    pcmpeqd %xmm8, %xmm2
; SSSE3-NEXT:    pxor %xmm7, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v8i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm4, %xmm5
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    pmuludq %xmm2, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7]
; SSE41-NEXT:    pxor %xmm8, %xmm8
; SSE41-NEXT:    pcmpeqd %xmm8, %xmm4
; SSE41-NEXT:    pcmpeqd %xmm7, %xmm7
; SSE41-NEXT:    pxor %xmm7, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm5, %xmm6
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    pmuludq %xmm3, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
; SSE41-NEXT:    pcmpeqd %xmm8, %xmm5
; SSE41-NEXT:    pxor %xmm7, %xmm5
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    movdqa %xmm5, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm8, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm8, %xmm5
; AVX1-NEXT:    vpxor %xmm6, %xmm5, %xmm5
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX1-NEXT:    vmovaps %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm3
; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT:    vpxor %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovdqa %ymm0, (%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7]
; AVX512-NEXT:    vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7]
; AVX512-NEXT:    vpmuludq %ymm3, %ymm4, %ymm3
; AVX512-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15]
; AVX512-NEXT:    vpermi2d %ymm3, %ymm2, %ymm4
; AVX512-NEXT:    vptestmd %ymm4, %ymm4, %k1
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm1
; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa %ymm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1)
  %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i32> %val, <8 x i32>* %p2
  ret <8 x i32> %res
}

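; With AVX-512 the gathered high halves feed VPTESTMD, so the overflow mask
; lands directly in a k-register and VPTERNLOGD materializes the sign-extended
; result under that mask.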
define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind {
; SSE2-LABEL: umulo_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm8
; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm8
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm10, %xmm9
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    pxor %xmm10, %xmm10
; SSE2-NEXT:    pcmpeqd %xmm10, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm11, %xmm11
; SSE2-NEXT:    pxor %xmm11, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm13, %xmm12
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
; SSE2-NEXT:    pcmpeqd %xmm10, %xmm15
; SSE2-NEXT:    pxor %xmm11, %xmm15
; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm14, %xmm13
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT:    pcmpeqd %xmm10, %xmm5
; SSE2-NEXT:    pxor %xmm11, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm7, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm14, %xmm7
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; SSE2-NEXT:    pcmpeqd %xmm10, %xmm6
; SSE2-NEXT:    pxor %xmm11, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE2-NEXT:    movdqa %xmm8, (%rdi)
; SSE2-NEXT:    movdqa %xmm15, %xmm1
; SSE2-NEXT:    movdqa %xmm5, %xmm2
; SSE2-NEXT:    movdqa %xmm6, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v16i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm8
; SSSE3-NEXT:    pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm4, %xmm8
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm10, %xmm9
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSSE3-NEXT:    pxor %xmm10, %xmm10
; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm11, %xmm11
; SSSE3-NEXT:    pxor %xmm11, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm5, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm13, %xmm12
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1]
; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm15
; SSSE3-NEXT:    pxor %xmm11, %xmm15
; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm6, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm14, %xmm13
; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm5
; SSSE3-NEXT:    pxor %xmm11, %xmm5
; SSSE3-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm7, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm14, %xmm7
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; SSSE3-NEXT:    pcmpeqd %xmm10, %xmm6
; SSSE3-NEXT:    pxor %xmm11, %xmm6
; SSSE3-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT:    movdqa %xmm1, 16(%rdi)
; SSSE3-NEXT:    movdqa %xmm8, (%rdi)
; SSSE3-NEXT:    movdqa %xmm15, %xmm1
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm6, %xmm3
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v16i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm8, %xmm9
; SSE41-NEXT:    movdqa %xmm0, %xmm8
; SSE41-NEXT:    pmuludq %xmm4, %xmm8
; SSE41-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7]
; SSE41-NEXT:    pxor %xmm12, %xmm12
; SSE41-NEXT:    pcmpeqd %xmm12, %xmm8
; SSE41-NEXT:    pcmpeqd %xmm13, %xmm13
; SSE41-NEXT:    pxor %xmm13, %xmm8
; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm9, %xmm10
; SSE41-NEXT:    movdqa %xmm1, %xmm9
; SSE41-NEXT:    pmuludq %xmm5, %xmm9
; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5],xmm10[6,7]
; SSE41-NEXT:    pcmpeqd %xmm12, %xmm9
; SSE41-NEXT:    pxor %xmm13, %xmm9
; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm10, %xmm11
; SSE41-NEXT:    movdqa %xmm2, %xmm10
; SSE41-NEXT:    pmuludq %xmm6, %xmm10
; SSE41-NEXT:    pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7]
; SSE41-NEXT:    pcmpeqd %xmm12, %xmm10
; SSE41-NEXT:    pxor %xmm13, %xmm10
; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm7[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm11, %xmm14
; SSE41-NEXT:    movdqa %xmm3, %xmm11
; SSE41-NEXT:    pmuludq %xmm7, %xmm11
; SSE41-NEXT:    pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3],xmm11[4,5],xmm14[6,7]
; SSE41-NEXT:    pcmpeqd %xmm12, %xmm11
; SSE41-NEXT:    pxor %xmm13, %xmm11
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm5, %xmm1
; SSE41-NEXT:    pmulld %xmm6, %xmm2
; SSE41-NEXT:    pmulld %xmm7, %xmm3
; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE41-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE41-NEXT:    movdqa %xmm1, 16(%rdi)
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    movdqa %xmm8, %xmm0
; SSE41-NEXT:    movdqa %xmm9, %xmm1
; SSE41-NEXT:    movdqa %xmm10, %xmm2
; SSE41-NEXT:    movdqa %xmm11, %xmm3
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm10
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm10[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm12
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm12[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vpmuludq %xmm10, %xmm12, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
; AVX1-NEXT:    vpxor %xmm8, %xmm8, %xmm8
; AVX1-NEXT:    vpcmpeqd %xmm7, %xmm8, %xmm7
; AVX1-NEXT:    vpcmpeqd %xmm9, %xmm9, %xmm9
; AVX1-NEXT:    vpxor %xmm7, %xmm9, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5],xmm4[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm8, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm9, %xmm4
; AVX1-NEXT:    vpackssdw %xmm7, %xmm4, %xmm11
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm6, %xmm4, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm8, %xmm5
; AVX1-NEXT:    vpxor %xmm5, %xmm9, %xmm13
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7]
; AVX1-NEXT:    vpcmpeqd %xmm5, %xmm8, %xmm5
; AVX1-NEXT:    vpxor %xmm5, %xmm9, %xmm5
; AVX1-NEXT:    vpackssdw %xmm13, %xmm5, %xmm5
; AVX1-NEXT:    vpacksswb %xmm11, %xmm5, %xmm5
; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpmulld %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm3
; AVX1-NEXT:    vpmulld %xmm10, %xmm12, %xmm6
; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vpacksswb %xmm11, %xmm11, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT:    vmovdqa %xmm6, 48(%rdi)
; AVX1-NEXT:    vmovdqa %xmm3, 32(%rdi)
; AVX1-NEXT:    vmovdqa %xmm4, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm5
; AVX2-NEXT:    vpshufd {{.*#+}} ymm5 = ymm5[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7]
; AVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm4, %ymm4
; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
; AVX2-NEXT:    vpxor %ymm6, %ymm4, %ymm4
; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm7
; AVX2-NEXT:    vpackssdw %xmm7, %xmm4, %xmm4
; AVX2-NEXT:    vpshufd {{.*#+}} ymm7 = ymm2[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm0[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpmuludq %ymm7, %ymm8, %ymm7
; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm8
; AVX2-NEXT:    vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7]
; AVX2-NEXT:    vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
; AVX2-NEXT:    vpcmpeqd %ymm5, %ymm7, %ymm5
; AVX2-NEXT:    vpxor %ymm6, %ymm5, %ymm5
; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT:    vpackssdw %xmm6, %xmm5, %xmm5
; AVX2-NEXT:    vpacksswb %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm2
; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm3
; AVX2-NEXT:    vpmovsxbd %xmm5, %ymm0
; AVX2-NEXT:    vpacksswb %xmm4, %xmm4, %xmm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vmovdqa %ymm3, 32(%rdi)
; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmuludq %zmm1, %zmm0, %zmm2
; AVX512-NEXT:    vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT:    vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
; AVX512-NEXT:    vpmuludq %zmm3, %zmm4, %zmm3
; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31]
; AVX512-NEXT:    vpermi2d %zmm3, %zmm2, %zmm4
; AVX512-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm1
; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    vmovdqa64 %zmm1, (%rdi)
; AVX512-NEXT:    retq
  %t = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1)
  %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0
  %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1
  %res = sext <16 x i1> %obit to <16 x i32>
  store <16 x i32> %val, <16 x i32>* %p2
  ret <16 x i32> %res
}

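; There is no vector i8 multiply, so the bytes are widened to i16 with
; PUNPCK*/PMOVZXBW, multiplied with PMULLW, and the low bytes packed back;
; overflow is flagged wherever the high byte (PSRLW $8) of a product is
; non-zero.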
define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind {
; SSE2-LABEL: umulo_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
; SSE2-NEXT:    pmullw %xmm3, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    movdqa %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm0, %xmm4
; SSE2-NEXT:    packuswb %xmm3, %xmm4
; SSE2-NEXT:    psrlw $8, %xmm5
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm5, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm2, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm0
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm4, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pxor %xmm2, %xmm2
; SSSE3-NEXT:    movdqa %xmm1, %xmm3
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm5
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
; SSSE3-NEXT:    pmullw %xmm3, %xmm5
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT:    movdqa %xmm5, %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSSE3-NEXT:    pmullw %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm0, %xmm4
; SSSE3-NEXT:    packuswb %xmm3, %xmm4
; SSSE3-NEXT:    psrlw $8, %xmm5
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    packuswb %xmm5, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
; SSSE3-NEXT:    pxor %xmm2, %xmm3
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    psrad $24, %xmm0
; SSSE3-NEXT:    movdqa %xmm3, %xmm1
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm4, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pand %xmm4, %xmm1
; SSE41-NEXT:    pmullw %xmm3, %xmm5
; SSE41-NEXT:    pand %xmm5, %xmm4
; SSE41-NEXT:    packuswb %xmm1, %xmm4
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm5
; SSE41-NEXT:    packuswb %xmm0, %xmm5
; SSE41-NEXT:    pcmpeqb %xmm2, %xmm5
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm3
; SSE41-NEXT:    pxor %xmm5, %xmm3
; SSE41-NEXT:    pmovsxbd %xmm3, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    movdqa %xmm4, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm5
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1211; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1212; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
1213; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1214; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
1215; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
1216; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
1217; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
1218; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1219; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
1220; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
1221; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1222; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
1223; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
1224; AVX1-NEXT:    vmovdqa %xmm4, (%rdi)
1225; AVX1-NEXT:    retq
1226;
1227; AVX2-LABEL: umulo_v16i8:
1228; AVX2:       # %bb.0:
1229; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1230; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1231; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1232; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
1233; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1234; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm2
1235; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1236; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1237; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1238; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1239; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
1240; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
1241; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm1
1242; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
1243; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1244; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
1245; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
1246; AVX2-NEXT:    retq
1247;
1248; AVX512F-LABEL: umulo_v16i8:
1249; AVX512F:       # %bb.0:
1250; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1251; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1252; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
1253; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm0
1254; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
1255; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
1256; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1257; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
1258; AVX512F-NEXT:    vpmovdb %zmm1, (%rdi)
1259; AVX512F-NEXT:    retq
1260;
1261; AVX512BW-LABEL: umulo_v16i8:
1262; AVX512BW:       # %bb.0:
1263; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1264; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1265; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm1
1266; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm0
1267; AVX512BW-NEXT:    vptestmw %ymm0, %ymm0, %k1
1268; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
1269; AVX512BW-NEXT:    vpmovwb %ymm1, (%rdi)
1270; AVX512BW-NEXT:    retq
1271  %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1)
1272  %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0
1273  %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1
1274  %res = sext <16 x i1> %obit to <16 x i32>
1275  store <16 x i8> %val, <16 x i8>* %p2
1276  ret <16 x i32> %res
1277}
1278
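; The byte-multiply lowering checked above and below widens each byte to
; 16 bits, multiplies with pmullw/vpmullw, packs the low bytes back as the
; result, and flags overflow wherever the high byte of the 16-bit product
; is non-zero. A minimal scalar sketch of that per-element check (an
; illustration only; %x and %y are hypothetical values, not part of this
; test):
;
;   %xw  = zext i8 %x to i16
;   %yw  = zext i8 %y to i16
;   %w   = mul i16 %xw, %yw
;   %hi  = lshr i16 %w, 8
;   %ovf = icmp ne i16 %hi, 0    ; overflow iff the product needs > 8 bits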
define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nounwind {
; SSE2-LABEL: umulo_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; SSE2-NEXT:    pmullw %xmm4, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    movdqa %xmm6, %xmm7
; SSE2-NEXT:    pand %xmm11, %xmm7
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSE2-NEXT:    pmullw %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm8
; SSE2-NEXT:    pand %xmm11, %xmm8
; SSE2-NEXT:    packuswb %xmm7, %xmm8
; SSE2-NEXT:    movdqa %xmm3, %xmm7
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
; SSE2-NEXT:    pmullw %xmm7, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm7
; SSE2-NEXT:    pand %xmm11, %xmm7
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE2-NEXT:    pmullw %xmm3, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm11
; SSE2-NEXT:    packuswb %xmm7, %xmm11
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    packuswb %xmm2, %xmm1
; SSE2-NEXT:    pcmpeqb %xmm5, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm2, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm6
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm6, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm5, %xmm0
; SSE2-NEXT:    pxor %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm3, %xmm5
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm5
; SSE2-NEXT:    psrad $31, %xmm5
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm6
; SSE2-NEXT:    psrad $31, %xmm6
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
; SSE2-NEXT:    psrad $24, %xmm7
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
; SSE2-NEXT:    psrad $24, %xmm4
; SSE2-NEXT:    movdqa %xmm11, 16(%rsi)
; SSE2-NEXT:    movdqa %xmm8, (%rsi)
; SSE2-NEXT:    movdqa %xmm4, 64(%rdi)
; SSE2-NEXT:    movdqa %xmm7, (%rdi)
; SSE2-NEXT:    movdqa %xmm1, 112(%rdi)
; SSE2-NEXT:    movdqa %xmm2, 96(%rdi)
; SSE2-NEXT:    movdqa %xmm6, 80(%rdi)
; SSE2-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    pxor %xmm5, %xmm5
; SSSE3-NEXT:    movdqa %xmm2, %xmm4
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm6
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; SSSE3-NEXT:    pmullw %xmm4, %xmm6
; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pand %xmm11, %xmm7
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSSE3-NEXT:    pmullw %xmm2, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm8
; SSSE3-NEXT:    pand %xmm11, %xmm8
; SSSE3-NEXT:    packuswb %xmm7, %xmm8
; SSSE3-NEXT:    movdqa %xmm3, %xmm7
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
; SSSE3-NEXT:    pmullw %xmm7, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm7
; SSSE3-NEXT:    pand %xmm11, %xmm7
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSSE3-NEXT:    pmullw %xmm3, %xmm1
; SSSE3-NEXT:    pand %xmm1, %xmm11
; SSSE3-NEXT:    packuswb %xmm7, %xmm11
; SSSE3-NEXT:    psrlw $8, %xmm2
; SSSE3-NEXT:    psrlw $8, %xmm1
; SSSE3-NEXT:    packuswb %xmm2, %xmm1
; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm2, %xmm2
; SSSE3-NEXT:    pxor %xmm2, %xmm1
; SSSE3-NEXT:    psrlw $8, %xmm6
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    packuswb %xmm6, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm5, %xmm0
; SSSE3-NEXT:    pxor %xmm2, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm3
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm0
; SSSE3-NEXT:    psrad $31, %xmm0
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm5
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm5
; SSSE3-NEXT:    psrad $31, %xmm5
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm1, %xmm6
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm6
; SSSE3-NEXT:    psrad $31, %xmm6
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
; SSSE3-NEXT:    psrad $24, %xmm7
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
; SSSE3-NEXT:    psrad $24, %xmm4
; SSSE3-NEXT:    movdqa %xmm11, 16(%rsi)
; SSSE3-NEXT:    movdqa %xmm8, (%rsi)
; SSSE3-NEXT:    movdqa %xmm4, 64(%rdi)
; SSSE3-NEXT:    movdqa %xmm7, (%rdi)
; SSSE3-NEXT:    movdqa %xmm1, 112(%rdi)
; SSSE3-NEXT:    movdqa %xmm2, 96(%rdi)
; SSSE3-NEXT:    movdqa %xmm6, 80(%rdi)
; SSSE3-NEXT:    movdqa %xmm3, 48(%rdi)
; SSSE3-NEXT:    movdqa %xmm5, 32(%rdi)
; SSSE3-NEXT:    movdqa %xmm0, 16(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v32i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    pxor %xmm8, %xmm8
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15]
; SSE41-NEXT:    pmullw %xmm2, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    movdqa %xmm0, %xmm6
; SSE41-NEXT:    pand %xmm10, %xmm6
; SSE41-NEXT:    pmullw %xmm5, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm9
; SSE41-NEXT:    pand %xmm10, %xmm9
; SSE41-NEXT:    packuswb %xmm6, %xmm9
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15]
; SSE41-NEXT:    pmullw %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm3
; SSE41-NEXT:    pand %xmm10, %xmm3
; SSE41-NEXT:    pmullw %xmm7, %xmm6
; SSE41-NEXT:    pand %xmm6, %xmm10
; SSE41-NEXT:    packuswb %xmm3, %xmm10
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm6
; SSE41-NEXT:    packuswb %xmm1, %xmm6
; SSE41-NEXT:    pcmpeqb %xmm8, %xmm6
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm1
; SSE41-NEXT:    pxor %xmm1, %xmm6
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    packuswb %xmm0, %xmm4
; SSE41-NEXT:    pcmpeqb %xmm8, %xmm4
; SSE41-NEXT:    pxor %xmm1, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm0
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm3
; SSE41-NEXT:    psrad $31, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm7
; SSE41-NEXT:    psrad $31, %xmm7
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm5
; SSE41-NEXT:    psrad $31, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; SSE41-NEXT:    pslld $31, %xmm2
; SSE41-NEXT:    psrad $31, %xmm2
; SSE41-NEXT:    pmovsxbd %xmm4, %xmm4
; SSE41-NEXT:    pmovsxbd %xmm6, %xmm6
; SSE41-NEXT:    movdqa %xmm10, 16(%rsi)
; SSE41-NEXT:    movdqa %xmm9, (%rsi)
; SSE41-NEXT:    movdqa %xmm6, 64(%rdi)
; SSE41-NEXT:    movdqa %xmm4, (%rdi)
; SSE41-NEXT:    movdqa %xmm2, 112(%rdi)
; SSE41-NEXT:    movdqa %xmm5, 96(%rdi)
; SSE41-NEXT:    movdqa %xmm7, 80(%rdi)
; SSE41-NEXT:    movdqa %xmm3, 48(%rdi)
; SSE41-NEXT:    movdqa %xmm1, 32(%rdi)
; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm4
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vpand %xmm5, %xmm6, %xmm7
; AVX1-NEXT:    vpackuswb %xmm4, %xmm7, %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX1-NEXT:    vpmullw %xmm7, %xmm4, %xmm4
; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm7
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm1
; AVX1-NEXT:    vpackuswb %xmm7, %xmm1, %xmm5
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm6, %xmm3
; AVX1-NEXT:    vpackuswb %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[1,1,1,1]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT:    vmovdqa %xmm5, 16(%rdi)
; AVX1-NEXT:    vmovdqa %xmm8, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX2-NEXT:    vpmullw %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT:    vpand %ymm4, %ymm3, %ymm5
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm1
; AVX2-NEXT:    vpackuswb %ymm5, %ymm1, %ymm4
; AVX2-NEXT:    vpsrlw $8, %ymm3, %ymm1
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm2
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm3
; AVX2-NEXT:    vmovdqa %ymm4, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: umulo_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm3
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm3
; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, 16(%rdi)
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
; AVX512F-NEXT:    vpmovdb %zmm2, (%rdi)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: umulo_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm0
; AVX512BW-NEXT:    vptestmw %zmm0, %zmm0, %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512BW-NEXT:    vpmovwb %zmm2, (%rdi)
; AVX512BW-NEXT:    retq
  %t = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> %a0, <32 x i8> %a1)
  %val = extractvalue {<32 x i8>, <32 x i1>} %t, 0
  %obit = extractvalue {<32 x i8>, <32 x i1>} %t, 1
  %res = sext <32 x i1> %obit to <32 x i32>
  store <32 x i8> %val, <32 x i8>* %p2
  ret <32 x i32> %res
}

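; Like the SSE versions of umulo_v32i8 above, umulo_v64i8 returns a result
; (<64 x i32>) that is too large for registers, so under the SysV x86-64 ABI
; it comes back through a hidden sret pointer: hence the `movq %rdi, %rax`
; prologue and the stores at offsets up to 240(%rdi), while %p2 (the product
; destination) arrives in %rsi. Roughly, for the SSE variants below:
;
;   <64 x i32> return value -> sret buffer in %rdi (also returned in %rax)
;   <64 x i8>* %p2          -> %rsi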
define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nounwind {
; SSE2-LABEL: umulo_v64i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    pxor %xmm9, %xmm9
; SSE2-NEXT:    movdqa %xmm4, %xmm8
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
; SSE2-NEXT:    movdqa %xmm0, %xmm10
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
; SSE2-NEXT:    pmullw %xmm8, %xmm10
; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    movdqa %xmm10, %xmm12
; SSE2-NEXT:    pand %xmm8, %xmm12
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSE2-NEXT:    pmullw %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm11
; SSE2-NEXT:    pand %xmm8, %xmm11
; SSE2-NEXT:    packuswb %xmm12, %xmm11
; SSE2-NEXT:    movdqa %xmm5, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
; SSE2-NEXT:    movdqa %xmm1, %xmm13
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
; SSE2-NEXT:    pmullw %xmm4, %xmm13
; SSE2-NEXT:    movdqa %xmm13, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
; SSE2-NEXT:    pmullw %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm12
; SSE2-NEXT:    pand %xmm8, %xmm12
; SSE2-NEXT:    packuswb %xmm4, %xmm12
; SSE2-NEXT:    movdqa %xmm6, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
; SSE2-NEXT:    pmullw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
; SSE2-NEXT:    pmullw %xmm6, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm14
; SSE2-NEXT:    pand %xmm8, %xmm14
; SSE2-NEXT:    packuswb %xmm4, %xmm14
; SSE2-NEXT:    movdqa %xmm7, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
; SSE2-NEXT:    movdqa %xmm3, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
; SSE2-NEXT:    pmullw %xmm4, %xmm6
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE2-NEXT:    pmullw %xmm7, %xmm3
; SSE2-NEXT:    movdqa %xmm6, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm8
; SSE2-NEXT:    packuswb %xmm4, %xmm8
; SSE2-NEXT:    psrlw $8, %xmm6
; SSE2-NEXT:    psrlw $8, %xmm3
; SSE2-NEXT:    packuswb %xmm6, %xmm3
; SSE2-NEXT:    psrlw $8, %xmm5
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    packuswb %xmm5, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm13
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    packuswb %xmm13, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm10
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm10, %xmm0
; SSE2-NEXT:    pcmpeqb %xmm9, %xmm3
; SSE2-NEXT:    pcmpeqb %xmm9, %xmm2
; SSE2-NEXT:    pcmpeqb %xmm9, %xmm1
; SSE2-NEXT:    pcmpeqb %xmm9, %xmm0
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm4, %xmm3
; SSE2-NEXT:    pxor %xmm4, %xmm2
; SSE2-NEXT:    pxor %xmm4, %xmm1
; SSE2-NEXT:    pxor %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm8, 48(%rsi)
; SSE2-NEXT:    movdqa %xmm14, 32(%rsi)
; SSE2-NEXT:    movdqa %xmm12, 16(%rsi)
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    movdqa %xmm11, (%rsi)
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm5
; SSE2-NEXT:    movdqa %xmm5, 192(%rdi)
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm5
; SSE2-NEXT:    movdqa %xmm5, 128(%rdi)
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm5
; SSE2-NEXT:    movdqa %xmm5, 64(%rdi)
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    psrad $24, %xmm5
; SSE2-NEXT:    movdqa %xmm5, (%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    movdqa %xmm4, 224(%rdi)
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm5
; SSE2-NEXT:    psrad $31, %xmm5
; SSE2-NEXT:    movdqa %xmm5, 240(%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm3, 208(%rdi)
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm4
; SSE2-NEXT:    psrad $31, %xmm4
; SSE2-NEXT:    movdqa %xmm4, 160(%rdi)
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm3, 176(%rdi)
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    movdqa %xmm2, 144(%rdi)
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm3
; SSE2-NEXT:    psrad $31, %xmm3
; SSE2-NEXT:    movdqa %xmm3, 96(%rdi)
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    movdqa %xmm2, 112(%rdi)
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm1, 80(%rdi)
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $31, %xmm2
; SSE2-NEXT:    psrad $31, %xmm2
; SSE2-NEXT:    movdqa %xmm2, 32(%rdi)
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm1, 48(%rdi)
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    movdqa %xmm0, 16(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v64i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    pxor %xmm9, %xmm9
; SSSE3-NEXT:    movdqa %xmm4, %xmm8
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm10
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
; SSSE3-NEXT:    pmullw %xmm8, %xmm10
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT:    movdqa %xmm10, %xmm12
; SSSE3-NEXT:    pand %xmm8, %xmm12
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSSE3-NEXT:    pmullw %xmm4, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm11
; SSSE3-NEXT:    pand %xmm8, %xmm11
; SSSE3-NEXT:    packuswb %xmm12, %xmm11
; SSSE3-NEXT:    movdqa %xmm5, %xmm4
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm13
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
; SSSE3-NEXT:    pmullw %xmm4, %xmm13
; SSSE3-NEXT:    movdqa %xmm13, %xmm4
; SSSE3-NEXT:    pand %xmm8, %xmm4
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
; SSSE3-NEXT:    pmullw %xmm5, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm12
; SSSE3-NEXT:    pand %xmm8, %xmm12
; SSSE3-NEXT:    packuswb %xmm4, %xmm12
; SSSE3-NEXT:    movdqa %xmm6, %xmm4
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
; SSSE3-NEXT:    pmullw %xmm4, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm4
; SSSE3-NEXT:    pand %xmm8, %xmm4
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
; SSSE3-NEXT:    pmullw %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm14
; SSSE3-NEXT:    pand %xmm8, %xmm14
; SSSE3-NEXT:    packuswb %xmm4, %xmm14
; SSSE3-NEXT:    movdqa %xmm7, %xmm4
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
; SSSE3-NEXT:    movdqa %xmm3, %xmm6
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
; SSSE3-NEXT:    pmullw %xmm4, %xmm6
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSSE3-NEXT:    pmullw %xmm7, %xmm3
; SSSE3-NEXT:    movdqa %xmm6, %xmm4
; SSSE3-NEXT:    pand %xmm8, %xmm4
; SSSE3-NEXT:    pand %xmm3, %xmm8
; SSSE3-NEXT:    packuswb %xmm4, %xmm8
; SSSE3-NEXT:    psrlw $8, %xmm6
; SSSE3-NEXT:    psrlw $8, %xmm3
; SSSE3-NEXT:    packuswb %xmm6, %xmm3
; SSSE3-NEXT:    psrlw $8, %xmm5
; SSSE3-NEXT:    psrlw $8, %xmm2
; SSSE3-NEXT:    packuswb %xmm5, %xmm2
; SSSE3-NEXT:    psrlw $8, %xmm13
; SSSE3-NEXT:    psrlw $8, %xmm1
; SSSE3-NEXT:    packuswb %xmm13, %xmm1
; SSSE3-NEXT:    psrlw $8, %xmm10
; SSSE3-NEXT:    psrlw $8, %xmm0
; SSSE3-NEXT:    packuswb %xmm10, %xmm0
; SSSE3-NEXT:    pcmpeqb %xmm9, %xmm3
; SSSE3-NEXT:    pcmpeqb %xmm9, %xmm2
; SSSE3-NEXT:    pcmpeqb %xmm9, %xmm1
; SSSE3-NEXT:    pcmpeqb %xmm9, %xmm0
; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm4
; SSSE3-NEXT:    pxor %xmm4, %xmm3
; SSSE3-NEXT:    pxor %xmm4, %xmm2
; SSSE3-NEXT:    pxor %xmm4, %xmm1
; SSSE3-NEXT:    pxor %xmm4, %xmm0
; SSSE3-NEXT:    movdqa %xmm8, 48(%rsi)
; SSSE3-NEXT:    movdqa %xmm14, 32(%rsi)
; SSSE3-NEXT:    movdqa %xmm12, 16(%rsi)
; SSSE3-NEXT:    movdqa %xmm3, %xmm4
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    movdqa %xmm11, (%rsi)
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    psrad $24, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, 192(%rdi)
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    psrad $24, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, 128(%rdi)
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    psrad $24, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, 64(%rdi)
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    psrad $24, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, (%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm4
; SSSE3-NEXT:    psrad $31, %xmm4
; SSSE3-NEXT:    movdqa %xmm4, 224(%rdi)
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm5
; SSSE3-NEXT:    psrad $31, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, 240(%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm4
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, 208(%rdi)
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm4
; SSSE3-NEXT:    psrad $31, %xmm4
; SSSE3-NEXT:    movdqa %xmm4, 160(%rdi)
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, 176(%rdi)
; SSSE3-NEXT:    movdqa %xmm1, %xmm3
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, 144(%rdi)
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm3
; SSSE3-NEXT:    psrad $31, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, 96(%rdi)
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, 112(%rdi)
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, 80(%rdi)
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSSE3-NEXT:    pslld $31, %xmm2
; SSSE3-NEXT:    psrad $31, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, 32(%rdi)
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, 48(%rdi)
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm0
; SSSE3-NEXT:    psrad $31, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, 16(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v64i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    pxor %xmm13, %xmm13
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15]
; SSE41-NEXT:    pmullw %xmm4, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    pand %xmm9, %xmm4
; SSE41-NEXT:    pmullw %xmm10, %xmm8
; SSE41-NEXT:    movdqa %xmm8, %xmm10
; SSE41-NEXT:    pand %xmm9, %xmm10
; SSE41-NEXT:    packuswb %xmm4, %xmm10
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm11 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15]
; SSE41-NEXT:    pmullw %xmm5, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    pand %xmm9, %xmm5
; SSE41-NEXT:    pmullw %xmm11, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm11
; SSE41-NEXT:    pand %xmm9, %xmm11
; SSE41-NEXT:    packuswb %xmm5, %xmm11
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
; SSE41-NEXT:    pmullw %xmm6, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm6
; SSE41-NEXT:    pand %xmm9, %xmm6
; SSE41-NEXT:    pmullw %xmm12, %xmm5
; SSE41-NEXT:    movdqa %xmm5, %xmm12
; SSE41-NEXT:    pand %xmm9, %xmm12
; SSE41-NEXT:    packuswb %xmm6, %xmm12
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15]
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
; SSE41-NEXT:    pmullw %xmm7, %xmm3
; SSE41-NEXT:    pmullw %xmm14, %xmm6
; SSE41-NEXT:    movdqa %xmm3, %xmm7
; SSE41-NEXT:    pand %xmm9, %xmm7
; SSE41-NEXT:    pand %xmm6, %xmm9
; SSE41-NEXT:    packuswb %xmm7, %xmm9
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm6
; SSE41-NEXT:    packuswb %xmm3, %xmm6
; SSE41-NEXT:    psrlw $8, %xmm2
; SSE41-NEXT:    psrlw $8, %xmm5
; SSE41-NEXT:    packuswb %xmm2, %xmm5
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm4
; SSE41-NEXT:    packuswb %xmm1, %xmm4
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    psrlw $8, %xmm8
; SSE41-NEXT:    packuswb %xmm0, %xmm8
; SSE41-NEXT:    pcmpeqb %xmm13, %xmm6
; SSE41-NEXT:    pcmpeqb %xmm13, %xmm5
; SSE41-NEXT:    pcmpeqb %xmm13, %xmm4
; SSE41-NEXT:    pcmpeqb %xmm13, %xmm8
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE41-NEXT:    pxor %xmm0, %xmm6
; SSE41-NEXT:    pxor %xmm0, %xmm5
; SSE41-NEXT:    pxor %xmm0, %xmm4
; SSE41-NEXT:    pxor %xmm0, %xmm8
; SSE41-NEXT:    movdqa %xmm9, 48(%rsi)
; SSE41-NEXT:    movdqa %xmm12, 32(%rsi)
; SSE41-NEXT:    movdqa %xmm11, 16(%rsi)
; SSE41-NEXT:    movdqa %xmm10, (%rsi)
; SSE41-NEXT:    pmovsxbd %xmm6, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 192(%rdi)
; SSE41-NEXT:    pmovsxbd %xmm5, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 128(%rdi)
; SSE41-NEXT:    pmovsxbd %xmm4, %xmm0
; SSE41-NEXT:    movdqa %xmm0, 64(%rdi)
; SSE41-NEXT:    pmovsxbd %xmm8, %xmm0
; SSE41-NEXT:    movdqa %xmm0, (%rdi)
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2065; SSE41-NEXT:    pslld $31, %xmm0
2066; SSE41-NEXT:    psrad $31, %xmm0
2067; SSE41-NEXT:    movdqa %xmm0, 224(%rdi)
2068; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
2069; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2070; SSE41-NEXT:    pslld $31, %xmm0
2071; SSE41-NEXT:    psrad $31, %xmm0
2072; SSE41-NEXT:    movdqa %xmm0, 240(%rdi)
2073; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
2074; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2075; SSE41-NEXT:    pslld $31, %xmm0
2076; SSE41-NEXT:    psrad $31, %xmm0
2077; SSE41-NEXT:    movdqa %xmm0, 208(%rdi)
2078; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
2079; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2080; SSE41-NEXT:    pslld $31, %xmm0
2081; SSE41-NEXT:    psrad $31, %xmm0
2082; SSE41-NEXT:    movdqa %xmm0, 160(%rdi)
2083; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
2084; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2085; SSE41-NEXT:    pslld $31, %xmm0
2086; SSE41-NEXT:    psrad $31, %xmm0
2087; SSE41-NEXT:    movdqa %xmm0, 176(%rdi)
2088; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
2089; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2090; SSE41-NEXT:    pslld $31, %xmm0
2091; SSE41-NEXT:    psrad $31, %xmm0
2092; SSE41-NEXT:    movdqa %xmm0, 144(%rdi)
2093; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
2094; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2095; SSE41-NEXT:    pslld $31, %xmm0
2096; SSE41-NEXT:    psrad $31, %xmm0
2097; SSE41-NEXT:    movdqa %xmm0, 96(%rdi)
2098; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
2099; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2100; SSE41-NEXT:    pslld $31, %xmm0
2101; SSE41-NEXT:    psrad $31, %xmm0
2102; SSE41-NEXT:    movdqa %xmm0, 112(%rdi)
2103; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
2104; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2105; SSE41-NEXT:    pslld $31, %xmm0
2106; SSE41-NEXT:    psrad $31, %xmm0
2107; SSE41-NEXT:    movdqa %xmm0, 80(%rdi)
2108; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3]
2109; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2110; SSE41-NEXT:    pslld $31, %xmm0
2111; SSE41-NEXT:    psrad $31, %xmm0
2112; SSE41-NEXT:    movdqa %xmm0, 32(%rdi)
2113; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
2114; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2115; SSE41-NEXT:    pslld $31, %xmm0
2116; SSE41-NEXT:    psrad $31, %xmm0
2117; SSE41-NEXT:    movdqa %xmm0, 48(%rdi)
2118; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
2119; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
2120; SSE41-NEXT:    pslld $31, %xmm0
2121; SSE41-NEXT:    psrad $31, %xmm0
2122; SSE41-NEXT:    movdqa %xmm0, 16(%rdi)
2123; SSE41-NEXT:    retq
2124;
2125; AVX1-LABEL: umulo_v64i8:
2126; AVX1:       # %bb.0:
2127; AVX1-NEXT:    movq %rdi, %rax
2128; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
2129; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2130; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2131; AVX1-NEXT:    vpmullw %xmm4, %xmm6, %xmm9
2132; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
2133; AVX1-NEXT:    vpand %xmm6, %xmm9, %xmm8
2134; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2135; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2136; AVX1-NEXT:    vpmullw %xmm7, %xmm4, %xmm11
2137; AVX1-NEXT:    vpand %xmm6, %xmm11, %xmm4
2138; AVX1-NEXT:    vpackuswb %xmm8, %xmm4, %xmm8
2139; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
2140; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
2141; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
2142; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15]
2143; AVX1-NEXT:    vpmullw %xmm4, %xmm7, %xmm12
2144; AVX1-NEXT:    vpand %xmm6, %xmm12, %xmm7
2145; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
2146; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
2147; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm13
2148; AVX1-NEXT:    vpand %xmm6, %xmm13, %xmm2
2149; AVX1-NEXT:    vpackuswb %xmm7, %xmm2, %xmm10
2150; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
2151; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2152; AVX1-NEXT:    vpmullw %xmm2, %xmm7, %xmm7
2153; AVX1-NEXT:    vpand %xmm6, %xmm7, %xmm2
2154; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2155; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2156; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
2157; AVX1-NEXT:    vpand %xmm6, %xmm0, %xmm4
2158; AVX1-NEXT:    vpackuswb %xmm2, %xmm4, %xmm14
2159; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
2160; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
2161; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
2162; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
2163; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
2164; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
2165; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2166; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm3
2167; AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm1
2168; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm4
2169; AVX1-NEXT:    vpackuswb %xmm1, %xmm4, %xmm15
2170; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
2171; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
2172; AVX1-NEXT:    vpackuswb %xmm2, %xmm3, %xmm2
2173; AVX1-NEXT:    vpsrlw $8, %xmm7, %xmm3
2174; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
2175; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
2176; AVX1-NEXT:    vpsrlw $8, %xmm12, %xmm3
2177; AVX1-NEXT:    vpsrlw $8, %xmm13, %xmm4
2178; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
2179; AVX1-NEXT:    vpsrlw $8, %xmm9, %xmm4
2180; AVX1-NEXT:    vpsrlw $8, %xmm11, %xmm6
2181; AVX1-NEXT:    vpackuswb %xmm4, %xmm6, %xmm4
2182; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm2, %xmm2
2183; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm0, %xmm0
2184; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm3, %xmm3
2185; AVX1-NEXT:    vpcmpeqb %xmm5, %xmm4, %xmm7
2186; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
2187; AVX1-NEXT:    vpxor %xmm1, %xmm2, %xmm6
2188; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm4
2189; AVX1-NEXT:    vpxor %xmm1, %xmm3, %xmm5
2190; AVX1-NEXT:    vpxor %xmm1, %xmm7, %xmm3
2191; AVX1-NEXT:    vmovdqa %xmm15, 48(%rsi)
2192; AVX1-NEXT:    vmovdqa %xmm14, 32(%rsi)
2193; AVX1-NEXT:    vmovdqa %xmm10, 16(%rsi)
2194; AVX1-NEXT:    vmovdqa %xmm8, (%rsi)
2195; AVX1-NEXT:    vpmovsxbd %xmm6, %xmm0
2196; AVX1-NEXT:    vmovdqa %xmm0, 192(%rdi)
2197; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm0
2198; AVX1-NEXT:    vmovdqa %xmm0, 128(%rdi)
2199; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm0
2200; AVX1-NEXT:    vmovdqa %xmm0, 64(%rdi)
2201; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm0
2202; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
2203; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3]
2204; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2205; AVX1-NEXT:    vmovdqa %xmm0, 224(%rdi)
2206; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
2207; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2208; AVX1-NEXT:    vmovdqa %xmm0, 240(%rdi)
2209; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
2210; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2211; AVX1-NEXT:    vmovdqa %xmm0, 208(%rdi)
2212; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3]
2213; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2214; AVX1-NEXT:    vmovdqa %xmm0, 160(%rdi)
2215; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
2216; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2217; AVX1-NEXT:    vmovdqa %xmm0, 176(%rdi)
2218; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
2219; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2220; AVX1-NEXT:    vmovdqa %xmm0, 144(%rdi)
2221; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
2222; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2223; AVX1-NEXT:    vmovdqa %xmm0, 96(%rdi)
2224; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
2225; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2226; AVX1-NEXT:    vmovdqa %xmm0, 112(%rdi)
2227; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
2228; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2229; AVX1-NEXT:    vmovdqa %xmm0, 80(%rdi)
2230; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
2231; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2232; AVX1-NEXT:    vmovdqa %xmm0, 32(%rdi)
2233; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
2234; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2235; AVX1-NEXT:    vmovdqa %xmm0, 48(%rdi)
2236; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1]
2237; AVX1-NEXT:    vpmovsxbd %xmm0, %xmm0
2238; AVX1-NEXT:    vmovdqa %xmm0, 16(%rdi)
2239; AVX1-NEXT:    vzeroupper
2240; AVX1-NEXT:    retq
2241;
2242; AVX2-LABEL: umulo_v64i8:
2243; AVX2:       # %bb.0:
2244; AVX2-NEXT:    movq %rdi, %rax
2245; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
2246; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31]
2247; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31]
2248; AVX2-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
2249; AVX2-NEXT:    vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2250; AVX2-NEXT:    vpand %ymm6, %ymm5, %ymm7
2251; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23]
2252; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23]
2253; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm2
2254; AVX2-NEXT:    vpand %ymm6, %ymm2, %ymm0
2255; AVX2-NEXT:    vpackuswb %ymm7, %ymm0, %ymm9
2256; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31]
2257; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm8 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31]
2258; AVX2-NEXT:    vpmullw %ymm7, %ymm8, %ymm7
2259; AVX2-NEXT:    vpand %ymm6, %ymm7, %ymm8
2260; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23]
2261; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23]
2262; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
2263; AVX2-NEXT:    vpand %ymm6, %ymm1, %ymm3
2264; AVX2-NEXT:    vpackuswb %ymm8, %ymm3, %ymm8
2265; AVX2-NEXT:    vpsrlw $8, %ymm7, %ymm6
2266; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
2267; AVX2-NEXT:    vpackuswb %ymm6, %ymm1, %ymm1
2268; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm1
2269; AVX2-NEXT:    vpcmpeqd %ymm6, %ymm6, %ymm6
2270; AVX2-NEXT:    vpxor %ymm6, %ymm1, %ymm1
2271; AVX2-NEXT:    vpsrlw $8, %ymm5, %ymm5
2272; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
2273; AVX2-NEXT:    vpackuswb %ymm5, %ymm2, %ymm2
2274; AVX2-NEXT:    vpcmpeqb %ymm4, %ymm2, %ymm2
2275; AVX2-NEXT:    vpxor %ymm6, %ymm2, %ymm2
2276; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
2277; AVX2-NEXT:    vpmovsxbd %xmm4, %ymm4
2278; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm5
2279; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
2280; AVX2-NEXT:    vpmovsxbd %xmm6, %ymm6
2281; AVX2-NEXT:    vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3]
2282; AVX2-NEXT:    vpmovsxbd %xmm7, %ymm7
2283; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
2284; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
2285; AVX2-NEXT:    vpmovsxbd %xmm3, %ymm3
2286; AVX2-NEXT:    vpmovsxbd %xmm2, %ymm2
2287; AVX2-NEXT:    vpmovsxbd %xmm5, %ymm5
2288; AVX2-NEXT:    vpmovsxbd %xmm1, %ymm1
2289; AVX2-NEXT:    vpmovsxbd %xmm0, %ymm0
2290; AVX2-NEXT:    vmovdqa %ymm8, 32(%rsi)
2291; AVX2-NEXT:    vmovdqa %ymm9, (%rsi)
2292; AVX2-NEXT:    vmovdqa %ymm0, 192(%rdi)
2293; AVX2-NEXT:    vmovdqa %ymm1, 128(%rdi)
2294; AVX2-NEXT:    vmovdqa %ymm5, 64(%rdi)
2295; AVX2-NEXT:    vmovdqa %ymm2, (%rdi)
2296; AVX2-NEXT:    vmovdqa %ymm3, 224(%rdi)
2297; AVX2-NEXT:    vmovdqa %ymm7, 160(%rdi)
2298; AVX2-NEXT:    vmovdqa %ymm6, 96(%rdi)
2299; AVX2-NEXT:    vmovdqa %ymm4, 32(%rdi)
2300; AVX2-NEXT:    vzeroupper
2301; AVX2-NEXT:    retq
2302;
2303; AVX512F-LABEL: umulo_v64i8:
2304; AVX512F:       # %bb.0:
2305; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
2306; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm3
2307; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
2308; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
2309; AVX512F-NEXT:    vextracti128 $1, %ymm5, %xmm4
2310; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
2311; AVX512F-NEXT:    vpmullw %ymm3, %ymm4, %ymm4
2312; AVX512F-NEXT:    vpsrlw $8, %ymm4, %ymm3
2313; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero
2314; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
2315; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2316; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
2317; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm5
2318; AVX512F-NEXT:    vpsrlw $8, %ymm5, %ymm2
2319; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
2320; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k2
2321; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
2322; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
2323; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
2324; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
2325; AVX512F-NEXT:    vpmullw %ymm2, %ymm3, %ymm6
2326; AVX512F-NEXT:    vpsrlw $8, %ymm6, %ymm2
2327; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero
2328; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k3
2329; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2330; AVX512F-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2331; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm7
2332; AVX512F-NEXT:    vpsrlw $8, %ymm7, %ymm0
2333; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
2334; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k4
2335; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
2336; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
2337; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
2338; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
2339; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero
2340; AVX512F-NEXT:    vpmovdb %zmm4, 48(%rdi)
2341; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero
2342; AVX512F-NEXT:    vpmovdb %zmm4, 32(%rdi)
2343; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero
2344; AVX512F-NEXT:    vpmovdb %zmm4, 16(%rdi)
2345; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero
2346; AVX512F-NEXT:    vpmovdb %zmm4, (%rdi)
2347; AVX512F-NEXT:    retq
2348;
2349; AVX512BW-LABEL: umulo_v64i8:
2350; AVX512BW:       # %bb.0:
2351; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2352; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
2353; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63]
2354; AVX512BW-NEXT:    vpmullw %zmm3, %zmm4, %zmm3
2355; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
2356; AVX512BW-NEXT:    vpandq %zmm4, %zmm3, %zmm5
2357; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
2358; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55]
2359; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
2360; AVX512BW-NEXT:    vpandq %zmm4, %zmm0, %zmm1
2361; AVX512BW-NEXT:    vpackuswb %zmm5, %zmm1, %zmm4
2362; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm1
2363; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
2364; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
2365; AVX512BW-NEXT:    vptestmb %zmm0, %zmm0, %k1
2366; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
2367; AVX512BW-NEXT:    kshiftrd $16, %k1, %k2
2368; AVX512BW-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
2369; AVX512BW-NEXT:    kshiftrq $32, %k1, %k1
2370; AVX512BW-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
2371; AVX512BW-NEXT:    kshiftrd $16, %k1, %k1
2372; AVX512BW-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
2373; AVX512BW-NEXT:    vmovdqa64 %zmm4, (%rdi)
2374; AVX512BW-NEXT:    retq
2375  %t = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> %a0, <64 x i8> %a1)
2376  %val = extractvalue {<64 x i8>, <64 x i1>} %t, 0
2377  %obit = extractvalue {<64 x i8>, <64 x i1>} %t, 1
2378  %res = sext <64 x i1> %obit to <64 x i32>
2379  store <64 x i8> %val, <64 x i8>* %p2
2380  ret <64 x i32> %res
2381}
2382
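; For 16-bit elements no widening is needed: pmullw/vpmullw produces the low
; half of each product and pmulhuw/vpmulhuw the high half, and the multiply
; overflows exactly when the high half is nonzero, so the overflow mask is
; just an (inverted) compare of the high half against zero, sign-extended
; to <8 x i32>.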
define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind {
; SSE2-LABEL: umulo_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pmullw %xmm1, %xmm2
; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
; SSE2-NEXT:    pxor %xmm0, %xmm0
; SSE2-NEXT:    pcmpeqw %xmm0, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE2-NEXT:    pxor %xmm0, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT:    psrad $16, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm2, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pmullw %xmm1, %xmm2
; SSSE3-NEXT:    pmulhuw %xmm0, %xmm1
; SSSE3-NEXT:    pxor %xmm0, %xmm0
; SSSE3-NEXT:    pcmpeqw %xmm0, %xmm1
; SSSE3-NEXT:    pcmpeqd %xmm0, %xmm0
; SSSE3-NEXT:    pxor %xmm0, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT:    psrad $16, %xmm0
; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSSE3-NEXT:    pslld $31, %xmm1
; SSSE3-NEXT:    psrad $31, %xmm1
; SSSE3-NEXT:    movdqa %xmm2, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmullw %xmm1, %xmm2
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpeqw %xmm0, %xmm1
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE41-NEXT:    pxor %xmm0, %xmm1
; SSE41-NEXT:    pmovsxwd %xmm1, %xmm0
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $31, %xmm1
; SSE41-NEXT:    psrad $31, %xmm1
; SSE41-NEXT:    movdqa %xmm2, (%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: umulo_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512F-NEXT:    vptestmd %ymm0, %ymm0, %k1
; AVX512F-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512F-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: umulo_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm2
; AVX512BW-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vptestmw %xmm0, %xmm0, %k1
; AVX512BW-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512BW-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa %xmm2, (%rdi)
; AVX512BW-NEXT:    retq
  %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1)
  %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0
  %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1
  %res = sext <8 x i1> %obit to <8 x i32>
  store <8 x i16> %val, <8 x i16>* %p2
  ret <8 x i32> %res
}

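; There is no legal 64-bit vector multiply, so the v2i64 case is scalarized:
; each lane is moved to a GPR and multiplied with mulq, which sets OF on
; unsigned overflow; the flags are then rebuilt into a vector mask via cmovq
; of -1/0 (or seto plus mask registers on AVX512) before being repacked.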
define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind {
; SSE2-LABEL: umulo_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSE2-NEXT:    movq %xmm2, %r8
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE2-NEXT:    movq %xmm2, %r10
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    movq %xmm1, %rdx
; SSE2-NEXT:    xorl %ecx, %ecx
; SSE2-NEXT:    mulq %rdx
; SSE2-NEXT:    movq $-1, %r9
; SSE2-NEXT:    movl $0, %esi
; SSE2-NEXT:    cmovoq %r9, %rsi
; SSE2-NEXT:    movq %rax, %xmm1
; SSE2-NEXT:    movq %r8, %rax
; SSE2-NEXT:    mulq %r10
; SSE2-NEXT:    movq %rax, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    movq %rsi, %xmm0
; SSE2-NEXT:    cmovoq %r9, %rcx
; SSE2-NEXT:    movq %rcx, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa %xmm1, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; SSSE3-NEXT:    movq %xmm2, %r8
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movq %xmm2, %r10
; SSSE3-NEXT:    movq %xmm0, %rax
; SSSE3-NEXT:    movq %xmm1, %rdx
; SSSE3-NEXT:    xorl %ecx, %ecx
; SSSE3-NEXT:    mulq %rdx
; SSSE3-NEXT:    movq $-1, %r9
; SSSE3-NEXT:    movl $0, %esi
; SSSE3-NEXT:    cmovoq %r9, %rsi
; SSSE3-NEXT:    movq %rax, %xmm1
; SSSE3-NEXT:    movq %r8, %rax
; SSSE3-NEXT:    mulq %r10
; SSSE3-NEXT:    movq %rax, %xmm0
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSSE3-NEXT:    movq %rsi, %xmm0
; SSSE3-NEXT:    cmovoq %r9, %rcx
; SSSE3-NEXT:    movq %rcx, %xmm2
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT:    movdqa %xmm1, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movq %xmm0, %r10
; SSE41-NEXT:    movq %xmm1, %r8
; SSE41-NEXT:    pextrq $1, %xmm0, %rax
; SSE41-NEXT:    pextrq $1, %xmm1, %rdx
; SSE41-NEXT:    xorl %esi, %esi
; SSE41-NEXT:    mulq %rdx
; SSE41-NEXT:    movq $-1, %r9
; SSE41-NEXT:    movl $0, %ecx
; SSE41-NEXT:    cmovoq %r9, %rcx
; SSE41-NEXT:    movq %rax, %xmm0
; SSE41-NEXT:    movq %r10, %rax
; SSE41-NEXT:    mulq %r8
; SSE41-NEXT:    movq %rax, %xmm1
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE41-NEXT:    movq %rcx, %xmm0
; SSE41-NEXT:    cmovoq %r9, %rsi
; SSE41-NEXT:    movq %rsi, %xmm2
; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE41-NEXT:    movdqa %xmm1, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: umulo_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovq %xmm0, %r10
; AVX-NEXT:    vmovq %xmm1, %r8
; AVX-NEXT:    vpextrq $1, %xmm0, %rax
; AVX-NEXT:    vpextrq $1, %xmm1, %rdx
; AVX-NEXT:    xorl %esi, %esi
; AVX-NEXT:    mulq %rdx
; AVX-NEXT:    movq $-1, %r9
; AVX-NEXT:    movl $0, %ecx
; AVX-NEXT:    cmovoq %r9, %rcx
; AVX-NEXT:    vmovq %rax, %xmm0
; AVX-NEXT:    movq %r10, %rax
; AVX-NEXT:    mulq %r8
; AVX-NEXT:    vmovq %rax, %xmm1
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX-NEXT:    vmovq %rcx, %xmm0
; AVX-NEXT:    cmovoq %r9, %rsi
; AVX-NEXT:    vmovq %rsi, %xmm2
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: umulo_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %xmm0, %rcx
; AVX512F-NEXT:    vmovq %xmm1, %rsi
; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
; AVX512F-NEXT:    vpextrq $1, %xmm1, %rdx
; AVX512F-NEXT:    mulq %rdx
; AVX512F-NEXT:    seto %r8b
; AVX512F-NEXT:    vmovq %rax, %xmm0
; AVX512F-NEXT:    movq %rcx, %rax
; AVX512F-NEXT:    mulq %rsi
; AVX512F-NEXT:    vmovq %rax, %xmm1
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512F-NEXT:    seto %al
; AVX512F-NEXT:    movw $-3, %cx
; AVX512F-NEXT:    kmovw %ecx, %k0
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    kandw %k0, %k1, %k0
; AVX512F-NEXT:    kmovw %r8d, %k1
; AVX512F-NEXT:    kshiftlw $15, %k1, %k1
; AVX512F-NEXT:    kshiftrw $14, %k1, %k1
; AVX512F-NEXT:    korw %k1, %k0, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512F-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: umulo_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovq %xmm0, %rcx
; AVX512BW-NEXT:    vmovq %xmm1, %rsi
; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
; AVX512BW-NEXT:    vpextrq $1, %xmm1, %rdx
; AVX512BW-NEXT:    mulq %rdx
; AVX512BW-NEXT:    seto %r8b
; AVX512BW-NEXT:    vmovq %rax, %xmm0
; AVX512BW-NEXT:    movq %rcx, %rax
; AVX512BW-NEXT:    mulq %rsi
; AVX512BW-NEXT:    vmovq %rax, %xmm1
; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512BW-NEXT:    seto %al
; AVX512BW-NEXT:    movw $-3, %cx
; AVX512BW-NEXT:    kmovd %ecx, %k0
; AVX512BW-NEXT:    kmovd %eax, %k1
; AVX512BW-NEXT:    kandw %k0, %k1, %k0
; AVX512BW-NEXT:    kmovd %r8d, %k1
; AVX512BW-NEXT:    kshiftlw $15, %k1, %k1
; AVX512BW-NEXT:    kshiftrw $14, %k1, %k1
; AVX512BW-NEXT:    korw %k1, %k0, %k1
; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512BW-NEXT:    vmovdqa %xmm1, (%rdi)
; AVX512BW-NEXT:    retq
  %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1)
  %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i64> %val, <2 x i64>* %p2
  ret <2 x i32> %res
}

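; The illegal i24 element type is promoted to i32: both operands are masked
; to 24 bits, and overflow is signalled when either the high half of the
; 32x32 multiply (from pmuludq) is nonzero or the low 32-bit product has
; bits set above bit 23 (tested with psrld $24). The 24-bit results are
; written back with overlapping 16-bit and 8-bit stores.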
define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind {
; SSE2-LABEL: umulo_v4i24:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm3
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm5
; SSE2-NEXT:    pxor %xmm3, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    psrld $24, %xmm0
; SSE2-NEXT:    pcmpgtd %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    movw %ax, (%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT:    movd %xmm2, %ecx
; SSE2-NEXT:    movw %cx, 6(%rdi)
; SSE2-NEXT:    movd %xmm1, %edx
; SSE2-NEXT:    movw %dx, 3(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 2(%rdi)
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    movw %ax, 9(%rdi)
; SSE2-NEXT:    shrl $16, %ecx
; SSE2-NEXT:    movb %cl, 8(%rdi)
; SSE2-NEXT:    shrl $16, %edx
; SSE2-NEXT:    movb %dl, 5(%rdi)
; SSE2-NEXT:    shrl $16, %eax
; SSE2-NEXT:    movb %al, 11(%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v4i24:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSSE3-NEXT:    pand %xmm0, %xmm1
; SSSE3-NEXT:    pand %xmm0, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm0, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSSE3-NEXT:    pxor %xmm4, %xmm4
; SSSE3-NEXT:    pcmpeqd %xmm4, %xmm3
; SSSE3-NEXT:    pcmpeqd %xmm5, %xmm5
; SSSE3-NEXT:    pxor %xmm3, %xmm5
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSSE3-NEXT:    psrld $24, %xmm0
; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm0
; SSSE3-NEXT:    por %xmm5, %xmm0
; SSSE3-NEXT:    movd %xmm2, %eax
; SSSE3-NEXT:    movw %ax, (%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSSE3-NEXT:    movd %xmm2, %ecx
; SSSE3-NEXT:    movw %cx, 6(%rdi)
; SSSE3-NEXT:    movd %xmm1, %edx
; SSSE3-NEXT:    movw %dx, 3(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 2(%rdi)
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; SSSE3-NEXT:    movd %xmm1, %eax
; SSSE3-NEXT:    movw %ax, 9(%rdi)
; SSSE3-NEXT:    shrl $16, %ecx
; SSSE3-NEXT:    movb %cl, 8(%rdi)
; SSSE3-NEXT:    shrl $16, %edx
; SSSE3-NEXT:    movb %dl, 5(%rdi)
; SSSE3-NEXT:    shrl $16, %eax
; SSSE3-NEXT:    movb %al, 11(%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v4i24:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
; SSE41-NEXT:    pand %xmm2, %xmm0
; SSE41-NEXT:    pand %xmm2, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE41-NEXT:    pmuludq %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pmuludq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm4, %xmm4
; SSE41-NEXT:    pxor %xmm2, %xmm4
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pextrd $3, %xmm1, %eax
; SSE41-NEXT:    pextrd $2, %xmm1, %ecx
; SSE41-NEXT:    pextrd $1, %xmm1, %edx
; SSE41-NEXT:    movd %xmm1, %esi
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psrld $24, %xmm0
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm0
; SSE41-NEXT:    por %xmm4, %xmm0
; SSE41-NEXT:    movw %ax, 9(%rdi)
; SSE41-NEXT:    movw %cx, 6(%rdi)
; SSE41-NEXT:    movw %dx, 3(%rdi)
; SSE41-NEXT:    movw %si, (%rdi)
; SSE41-NEXT:    shrl $16, %eax
; SSE41-NEXT:    movb %al, 11(%rdi)
; SSE41-NEXT:    shrl $16, %ecx
; SSE41-NEXT:    movb %cl, 8(%rdi)
; SSE41-NEXT:    shrl $16, %edx
; SSE41-NEXT:    movb %dl, 5(%rdi)
; SSE41-NEXT:    shrl $16, %esi
; SSE41-NEXT:    movb %sil, 2(%rdi)
; SSE41-NEXT:    retq
;
; AVX1-LABEL: umulo_v4i24:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $24, %xmm1, %xmm0
; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpextrd $3, %xmm1, %eax
; AVX1-NEXT:    movw %ax, 9(%rdi)
; AVX1-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX1-NEXT:    movw %cx, 6(%rdi)
; AVX1-NEXT:    vpextrd $1, %xmm1, %edx
; AVX1-NEXT:    movw %dx, 3(%rdi)
; AVX1-NEXT:    vmovd %xmm1, %esi
; AVX1-NEXT:    movw %si, (%rdi)
; AVX1-NEXT:    shrl $16, %eax
; AVX1-NEXT:    movb %al, 11(%rdi)
; AVX1-NEXT:    shrl $16, %ecx
; AVX1-NEXT:    movb %cl, 8(%rdi)
; AVX1-NEXT:    shrl $16, %edx
; AVX1-NEXT:    movb %dl, 5(%rdi)
; AVX1-NEXT:    shrl $16, %esi
; AVX1-NEXT:    movb %sil, 2(%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: umulo_v4i24:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX2-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm3
; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
; AVX2-NEXT:    vpxor %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX2-NEXT:    vpsrld $24, %xmm1, %xmm0
; AVX2-NEXT:    vpcmpgtd %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpextrd $3, %xmm1, %eax
; AVX2-NEXT:    movw %ax, 9(%rdi)
; AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX2-NEXT:    movw %cx, 6(%rdi)
; AVX2-NEXT:    vpextrd $1, %xmm1, %edx
; AVX2-NEXT:    movw %dx, 3(%rdi)
; AVX2-NEXT:    vmovd %xmm1, %esi
; AVX2-NEXT:    movw %si, (%rdi)
; AVX2-NEXT:    shrl $16, %eax
; AVX2-NEXT:    movb %al, 11(%rdi)
; AVX2-NEXT:    shrl $16, %ecx
; AVX2-NEXT:    movb %cl, 8(%rdi)
; AVX2-NEXT:    shrl $16, %edx
; AVX2-NEXT:    movb %dl, 5(%rdi)
; AVX2-NEXT:    shrl $16, %esi
; AVX2-NEXT:    movb %sil, 2(%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: umulo_v4i24:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215]
; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX512-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [1,5,3,7]
; AVX512-NEXT:    vpermi2d %xmm3, %xmm2, %xmm4
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vpsrld $24, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512-NEXT:    vpextrd $3, %xmm1, %eax
; AVX512-NEXT:    movw %ax, 9(%rdi)
; AVX512-NEXT:    vpextrd $2, %xmm1, %ecx
; AVX512-NEXT:    movw %cx, 6(%rdi)
; AVX512-NEXT:    vpextrd $1, %xmm1, %edx
; AVX512-NEXT:    movw %dx, 3(%rdi)
; AVX512-NEXT:    vmovd %xmm1, %esi
; AVX512-NEXT:    movw %si, (%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movb %al, 11(%rdi)
; AVX512-NEXT:    shrl $16, %ecx
; AVX512-NEXT:    movb %cl, 8(%rdi)
; AVX512-NEXT:    shrl $16, %edx
; AVX512-NEXT:    movb %dl, 5(%rdi)
; AVX512-NEXT:    shrl $16, %esi
; AVX512-NEXT:    movb %sil, 2(%rdi)
; AVX512-NEXT:    retq
  %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1)
  %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i24> %val, <4 x i24>* %p2
  ret <4 x i32> %res
}

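; For i1 elements an unsigned multiply is just an AND and can never overflow,
; so the overflow result folds away to all-zeros; only the product bits are
; packed (via movmskps or a mask register) and stored.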
define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind {
; SSE-LABEL: umulo_v4i1:
; SSE:       # %bb.0:
; SSE-NEXT:    pand %xmm1, %xmm0
; SSE-NEXT:    pslld $31, %xmm0
; SSE-NEXT:    movmskps %xmm0, %eax
; SSE-NEXT:    movb %al, (%rdi)
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: umulo_v4i1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX-NEXT:    vmovmskps %xmm0, %eax
; AVX-NEXT:    movb %al, (%rdi)
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: umulo_v4i1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    movb %al, (%rdi)
; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: umulo_v4i1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512BW-NEXT:    vptestmd %xmm0, %xmm0, %k0
; AVX512BW-NEXT:    kmovd %k0, %eax
; AVX512BW-NEXT:    movb %al, (%rdi)
; AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
  %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
  %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0
  %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1
  %res = sext <4 x i1> %obit to <4 x i32>
  store <4 x i1> %val, <4 x i1>* %p2
  ret <4 x i32> %res
}

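; i128 lanes go through the usual 128-bit multiply expansion: three mulq
; instructions per lane, with seto/setb glue combining the partial-product
; overflow conditions, plus an extra overflow whenever both operands of a
; lane have a nonzero high half.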
define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind {
; SSE2-LABEL: umulo_v2i128:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %r13
; SSE2-NEXT:    pushq %r12
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movq %r9, %r10
; SSE2-NEXT:    movq %rcx, %r12
; SSE2-NEXT:    movq %rdx, %r11
; SSE2-NEXT:    movq %rsi, %rax
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r9
; SSE2-NEXT:    testq %r10, %r10
; SSE2-NEXT:    setne %cl
; SSE2-NEXT:    testq %rsi, %rsi
; SSE2-NEXT:    setne %r13b
; SSE2-NEXT:    andb %cl, %r13b
; SSE2-NEXT:    mulq %r8
; SSE2-NEXT:    movq %rax, %rsi
; SSE2-NEXT:    seto %bpl
; SSE2-NEXT:    movq %r10, %rax
; SSE2-NEXT:    mulq %rdi
; SSE2-NEXT:    movq %rax, %rcx
; SSE2-NEXT:    seto %bl
; SSE2-NEXT:    orb %bpl, %bl
; SSE2-NEXT:    addq %rsi, %rcx
; SSE2-NEXT:    movq %rdi, %rax
; SSE2-NEXT:    mulq %r8
; SSE2-NEXT:    movq %rax, %r8
; SSE2-NEXT:    movq %rdx, %rsi
; SSE2-NEXT:    addq %rcx, %rsi
; SSE2-NEXT:    setb %cl
; SSE2-NEXT:    orb %bl, %cl
; SSE2-NEXT:    orb %r13b, %cl
; SSE2-NEXT:    testq %r9, %r9
; SSE2-NEXT:    setne %al
; SSE2-NEXT:    testq %r12, %r12
; SSE2-NEXT:    setne %r10b
; SSE2-NEXT:    andb %al, %r10b
; SSE2-NEXT:    movq %r12, %rax
; SSE2-NEXT:    mulq %r15
; SSE2-NEXT:    movq %rax, %rdi
; SSE2-NEXT:    seto %bpl
; SSE2-NEXT:    movq %r9, %rax
; SSE2-NEXT:    mulq %r11
; SSE2-NEXT:    movq %rax, %rbx
; SSE2-NEXT:    seto %r9b
; SSE2-NEXT:    orb %bpl, %r9b
; SSE2-NEXT:    addq %rdi, %rbx
; SSE2-NEXT:    movq %r11, %rax
; SSE2-NEXT:    mulq %r15
; SSE2-NEXT:    addq %rbx, %rdx
; SSE2-NEXT:    setb %bl
; SSE2-NEXT:    orb %r9b, %bl
; SSE2-NEXT:    orb %r10b, %bl
; SSE2-NEXT:    movzbl %bl, %edi
; SSE2-NEXT:    negl %edi
; SSE2-NEXT:    movd %edi, %xmm1
; SSE2-NEXT:    movzbl %cl, %ecx
; SSE2-NEXT:    negl %ecx
; SSE2-NEXT:    movd %ecx, %xmm0
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movq %rax, 16(%r14)
; SSE2-NEXT:    movq %r8, (%r14)
; SSE2-NEXT:    movq %rdx, 24(%r14)
; SSE2-NEXT:    movq %rsi, 8(%r14)
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r12
; SSE2-NEXT:    popq %r13
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: umulo_v2i128:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pushq %rbp
; SSSE3-NEXT:    pushq %r15
; SSSE3-NEXT:    pushq %r14
; SSSE3-NEXT:    pushq %r13
; SSSE3-NEXT:    pushq %r12
; SSSE3-NEXT:    pushq %rbx
; SSSE3-NEXT:    movq %r9, %r10
; SSSE3-NEXT:    movq %rcx, %r12
; SSSE3-NEXT:    movq %rdx, %r11
; SSSE3-NEXT:    movq %rsi, %rax
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r15
; SSSE3-NEXT:    movq {{[0-9]+}}(%rsp), %r9
; SSSE3-NEXT:    testq %r10, %r10
; SSSE3-NEXT:    setne %cl
; SSSE3-NEXT:    testq %rsi, %rsi
; SSSE3-NEXT:    setne %r13b
; SSSE3-NEXT:    andb %cl, %r13b
; SSSE3-NEXT:    mulq %r8
; SSSE3-NEXT:    movq %rax, %rsi
; SSSE3-NEXT:    seto %bpl
; SSSE3-NEXT:    movq %r10, %rax
; SSSE3-NEXT:    mulq %rdi
; SSSE3-NEXT:    movq %rax, %rcx
; SSSE3-NEXT:    seto %bl
; SSSE3-NEXT:    orb %bpl, %bl
; SSSE3-NEXT:    addq %rsi, %rcx
; SSSE3-NEXT:    movq %rdi, %rax
; SSSE3-NEXT:    mulq %r8
; SSSE3-NEXT:    movq %rax, %r8
; SSSE3-NEXT:    movq %rdx, %rsi
; SSSE3-NEXT:    addq %rcx, %rsi
; SSSE3-NEXT:    setb %cl
; SSSE3-NEXT:    orb %bl, %cl
; SSSE3-NEXT:    orb %r13b, %cl
; SSSE3-NEXT:    testq %r9, %r9
; SSSE3-NEXT:    setne %al
; SSSE3-NEXT:    testq %r12, %r12
; SSSE3-NEXT:    setne %r10b
; SSSE3-NEXT:    andb %al, %r10b
; SSSE3-NEXT:    movq %r12, %rax
; SSSE3-NEXT:    mulq %r15
; SSSE3-NEXT:    movq %rax, %rdi
; SSSE3-NEXT:    seto %bpl
; SSSE3-NEXT:    movq %r9, %rax
; SSSE3-NEXT:    mulq %r11
; SSSE3-NEXT:    movq %rax, %rbx
; SSSE3-NEXT:    seto %r9b
; SSSE3-NEXT:    orb %bpl, %r9b
; SSSE3-NEXT:    addq %rdi, %rbx
; SSSE3-NEXT:    movq %r11, %rax
; SSSE3-NEXT:    mulq %r15
; SSSE3-NEXT:    addq %rbx, %rdx
; SSSE3-NEXT:    setb %bl
; SSSE3-NEXT:    orb %r9b, %bl
; SSSE3-NEXT:    orb %r10b, %bl
; SSSE3-NEXT:    movzbl %bl, %edi
; SSSE3-NEXT:    negl %edi
; SSSE3-NEXT:    movd %edi, %xmm1
; SSSE3-NEXT:    movzbl %cl, %ecx
; SSSE3-NEXT:    negl %ecx
; SSSE3-NEXT:    movd %ecx, %xmm0
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movq %rax, 16(%r14)
; SSSE3-NEXT:    movq %r8, (%r14)
; SSSE3-NEXT:    movq %rdx, 24(%r14)
; SSSE3-NEXT:    movq %rsi, 8(%r14)
; SSSE3-NEXT:    popq %rbx
; SSSE3-NEXT:    popq %r12
; SSSE3-NEXT:    popq %r13
; SSSE3-NEXT:    popq %r14
; SSSE3-NEXT:    popq %r15
; SSSE3-NEXT:    popq %rbp
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: umulo_v2i128:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pushq %rbp
; SSE41-NEXT:    pushq %r15
; SSE41-NEXT:    pushq %r14
; SSE41-NEXT:    pushq %r13
; SSE41-NEXT:    pushq %r12
; SSE41-NEXT:    pushq %rbx
; SSE41-NEXT:    movq %r9, %r10
; SSE41-NEXT:    movq %rcx, %r12
; SSE41-NEXT:    movq %rdx, %r11
; SSE41-NEXT:    movq %rsi, %rax
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r15
; SSE41-NEXT:    movq {{[0-9]+}}(%rsp), %r9
; SSE41-NEXT:    testq %r10, %r10
; SSE41-NEXT:    setne %cl
; SSE41-NEXT:    testq %rsi, %rsi
; SSE41-NEXT:    setne %r13b
; SSE41-NEXT:    andb %cl, %r13b
; SSE41-NEXT:    mulq %r8
; SSE41-NEXT:    movq %rax, %rsi
; SSE41-NEXT:    seto %bpl
; SSE41-NEXT:    movq %r10, %rax
; SSE41-NEXT:    mulq %rdi
; SSE41-NEXT:    movq %rax, %rcx
; SSE41-NEXT:    seto %bl
; SSE41-NEXT:    orb %bpl, %bl
; SSE41-NEXT:    addq %rsi, %rcx
; SSE41-NEXT:    movq %rdi, %rax
; SSE41-NEXT:    mulq %r8
; SSE41-NEXT:    movq %rax, %r8
; SSE41-NEXT:    movq %rdx, %rsi
; SSE41-NEXT:    addq %rcx, %rsi
; SSE41-NEXT:    setb %cl
; SSE41-NEXT:    orb %bl, %cl
; SSE41-NEXT:    orb %r13b, %cl
; SSE41-NEXT:    testq %r9, %r9
; SSE41-NEXT:    setne %al
; SSE41-NEXT:    testq %r12, %r12
; SSE41-NEXT:    setne %r10b
; SSE41-NEXT:    andb %al, %r10b
; SSE41-NEXT:    movq %r12, %rax
; SSE41-NEXT:    mulq %r15
; SSE41-NEXT:    movq %rax, %rdi
; SSE41-NEXT:    seto %bpl
; SSE41-NEXT:    movq %r9, %rax
; SSE41-NEXT:    mulq %r11
; SSE41-NEXT:    movq %rax, %rbx
; SSE41-NEXT:    seto %r9b
; SSE41-NEXT:    orb %bpl, %r9b
; SSE41-NEXT:    addq %rdi, %rbx
; SSE41-NEXT:    movq %r11, %rax
; SSE41-NEXT:    mulq %r15
; SSE41-NEXT:    addq %rbx, %rdx
; SSE41-NEXT:    setb %bl
; SSE41-NEXT:    orb %r9b, %bl
; SSE41-NEXT:    orb %r10b, %bl
; SSE41-NEXT:    movzbl %bl, %edi
; SSE41-NEXT:    negl %edi
; SSE41-NEXT:    movzbl %cl, %ecx
; SSE41-NEXT:    negl %ecx
; SSE41-NEXT:    movd %ecx, %xmm0
; SSE41-NEXT:    pinsrd $1, %edi, %xmm0
; SSE41-NEXT:    movq %rax, 16(%r14)
; SSE41-NEXT:    movq %r8, (%r14)
; SSE41-NEXT:    movq %rdx, 24(%r14)
; SSE41-NEXT:    movq %rsi, 8(%r14)
3165; SSE41-NEXT:    popq %rbx
3166; SSE41-NEXT:    popq %r12
3167; SSE41-NEXT:    popq %r13
3168; SSE41-NEXT:    popq %r14
3169; SSE41-NEXT:    popq %r15
3170; SSE41-NEXT:    popq %rbp
3171; SSE41-NEXT:    retq
3172;
3173; AVX-LABEL: umulo_v2i128:
3174; AVX:       # %bb.0:
3175; AVX-NEXT:    pushq %rbp
3176; AVX-NEXT:    pushq %r15
3177; AVX-NEXT:    pushq %r14
3178; AVX-NEXT:    pushq %r13
3179; AVX-NEXT:    pushq %r12
3180; AVX-NEXT:    pushq %rbx
3181; AVX-NEXT:    movq %r9, %r10
3182; AVX-NEXT:    movq %rcx, %r12
3183; AVX-NEXT:    movq %rdx, %r11
3184; AVX-NEXT:    movq %rsi, %rax
3185; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r14
3186; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r15
3187; AVX-NEXT:    movq {{[0-9]+}}(%rsp), %r9
3188; AVX-NEXT:    testq %r10, %r10
3189; AVX-NEXT:    setne %cl
3190; AVX-NEXT:    testq %rsi, %rsi
3191; AVX-NEXT:    setne %r13b
3192; AVX-NEXT:    andb %cl, %r13b
3193; AVX-NEXT:    mulq %r8
3194; AVX-NEXT:    movq %rax, %rsi
3195; AVX-NEXT:    seto %bpl
3196; AVX-NEXT:    movq %r10, %rax
3197; AVX-NEXT:    mulq %rdi
3198; AVX-NEXT:    movq %rax, %rcx
3199; AVX-NEXT:    seto %bl
3200; AVX-NEXT:    orb %bpl, %bl
3201; AVX-NEXT:    addq %rsi, %rcx
3202; AVX-NEXT:    movq %rdi, %rax
3203; AVX-NEXT:    mulq %r8
3204; AVX-NEXT:    movq %rax, %r8
3205; AVX-NEXT:    movq %rdx, %rsi
3206; AVX-NEXT:    addq %rcx, %rsi
3207; AVX-NEXT:    setb %cl
3208; AVX-NEXT:    orb %bl, %cl
3209; AVX-NEXT:    orb %r13b, %cl
3210; AVX-NEXT:    testq %r9, %r9
3211; AVX-NEXT:    setne %al
3212; AVX-NEXT:    testq %r12, %r12
3213; AVX-NEXT:    setne %r10b
3214; AVX-NEXT:    andb %al, %r10b
3215; AVX-NEXT:    movq %r12, %rax
3216; AVX-NEXT:    mulq %r15
3217; AVX-NEXT:    movq %rax, %rdi
3218; AVX-NEXT:    seto %bpl
3219; AVX-NEXT:    movq %r9, %rax
3220; AVX-NEXT:    mulq %r11
3221; AVX-NEXT:    movq %rax, %rbx
3222; AVX-NEXT:    seto %r9b
3223; AVX-NEXT:    orb %bpl, %r9b
3224; AVX-NEXT:    addq %rdi, %rbx
3225; AVX-NEXT:    movq %r11, %rax
3226; AVX-NEXT:    mulq %r15
3227; AVX-NEXT:    addq %rbx, %rdx
3228; AVX-NEXT:    setb %bl
3229; AVX-NEXT:    orb %r9b, %bl
3230; AVX-NEXT:    orb %r10b, %bl
3231; AVX-NEXT:    movzbl %bl, %edi
3232; AVX-NEXT:    negl %edi
3233; AVX-NEXT:    movzbl %cl, %ecx
3234; AVX-NEXT:    negl %ecx
3235; AVX-NEXT:    vmovd %ecx, %xmm0
3236; AVX-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
3237; AVX-NEXT:    movq %rax, 16(%r14)
3238; AVX-NEXT:    movq %r8, (%r14)
3239; AVX-NEXT:    movq %rdx, 24(%r14)
3240; AVX-NEXT:    movq %rsi, 8(%r14)
3241; AVX-NEXT:    popq %rbx
3242; AVX-NEXT:    popq %r12
3243; AVX-NEXT:    popq %r13
3244; AVX-NEXT:    popq %r14
3245; AVX-NEXT:    popq %r15
3246; AVX-NEXT:    popq %rbp
3247; AVX-NEXT:    retq
3248;
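; Note: the AVX512 variants below combine the two per-lane overflow bits in
; mask registers (kmovw/kmovd, kshiftlw, korw) and materialize the
; sign-extended <2 x i32> result with a zero-masked vmovdqa32 of all-ones,
; rather than packing the negated GPR values into an XMM register with
; movd/punpckldq (SSE2/SSSE3) or pinsrd (SSE41/AVX).
;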
; AVX512F-LABEL: umulo_v2i128:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    pushq %rbp
; AVX512F-NEXT:    pushq %r15
; AVX512F-NEXT:    pushq %r14
; AVX512F-NEXT:    pushq %r13
; AVX512F-NEXT:    pushq %r12
; AVX512F-NEXT:    pushq %rbx
; AVX512F-NEXT:    movq %rcx, %rax
; AVX512F-NEXT:    movq %rdx, %r12
; AVX512F-NEXT:    movq %rdi, %r11
; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r15
; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT:    testq %r10, %r10
; AVX512F-NEXT:    setne %dl
; AVX512F-NEXT:    testq %rcx, %rcx
; AVX512F-NEXT:    setne %r13b
; AVX512F-NEXT:    andb %dl, %r13b
; AVX512F-NEXT:    mulq %r15
; AVX512F-NEXT:    movq %rax, %rdi
; AVX512F-NEXT:    seto %bpl
; AVX512F-NEXT:    movq %r10, %rax
; AVX512F-NEXT:    mulq %r12
; AVX512F-NEXT:    movq %rax, %rbx
; AVX512F-NEXT:    seto %cl
; AVX512F-NEXT:    orb %bpl, %cl
; AVX512F-NEXT:    addq %rdi, %rbx
; AVX512F-NEXT:    movq %r12, %rax
; AVX512F-NEXT:    mulq %r15
; AVX512F-NEXT:    movq %rax, %r10
; AVX512F-NEXT:    movq %rdx, %r15
; AVX512F-NEXT:    addq %rbx, %r15
; AVX512F-NEXT:    setb %al
; AVX512F-NEXT:    orb %cl, %al
; AVX512F-NEXT:    orb %r13b, %al
; AVX512F-NEXT:    kmovw %eax, %k0
; AVX512F-NEXT:    testq %r9, %r9
; AVX512F-NEXT:    setne %al
; AVX512F-NEXT:    testq %rsi, %rsi
; AVX512F-NEXT:    setne %cl
; AVX512F-NEXT:    andb %al, %cl
; AVX512F-NEXT:    movq %rsi, %rax
; AVX512F-NEXT:    mulq %r8
; AVX512F-NEXT:    movq %rax, %rsi
; AVX512F-NEXT:    seto %bpl
; AVX512F-NEXT:    movq %r9, %rax
; AVX512F-NEXT:    mulq %r11
; AVX512F-NEXT:    movq %rax, %rdi
; AVX512F-NEXT:    seto %bl
; AVX512F-NEXT:    orb %bpl, %bl
; AVX512F-NEXT:    addq %rsi, %rdi
; AVX512F-NEXT:    movq %r11, %rax
; AVX512F-NEXT:    mulq %r8
; AVX512F-NEXT:    addq %rdi, %rdx
; AVX512F-NEXT:    setb %sil
; AVX512F-NEXT:    orb %bl, %sil
; AVX512F-NEXT:    orb %cl, %sil
; AVX512F-NEXT:    andl $1, %esi
; AVX512F-NEXT:    kmovw %esi, %k1
; AVX512F-NEXT:    kshiftlw $1, %k0, %k0
; AVX512F-NEXT:    korw %k0, %k1, %k1
; AVX512F-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512F-NEXT:    movq %r10, 16(%r14)
; AVX512F-NEXT:    movq %rax, (%r14)
; AVX512F-NEXT:    movq %r15, 24(%r14)
; AVX512F-NEXT:    movq %rdx, 8(%r14)
; AVX512F-NEXT:    popq %rbx
; AVX512F-NEXT:    popq %r12
; AVX512F-NEXT:    popq %r13
; AVX512F-NEXT:    popq %r14
; AVX512F-NEXT:    popq %r15
; AVX512F-NEXT:    popq %rbp
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: umulo_v2i128:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    pushq %rbp
; AVX512BW-NEXT:    pushq %r15
; AVX512BW-NEXT:    pushq %r14
; AVX512BW-NEXT:    pushq %r13
; AVX512BW-NEXT:    pushq %r12
; AVX512BW-NEXT:    pushq %rbx
; AVX512BW-NEXT:    movq %rcx, %rax
; AVX512BW-NEXT:    movq %rdx, %r12
; AVX512BW-NEXT:    movq %rdi, %r11
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r14
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r15
; AVX512BW-NEXT:    movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT:    testq %r10, %r10
; AVX512BW-NEXT:    setne %dl
; AVX512BW-NEXT:    testq %rcx, %rcx
; AVX512BW-NEXT:    setne %r13b
; AVX512BW-NEXT:    andb %dl, %r13b
; AVX512BW-NEXT:    mulq %r15
; AVX512BW-NEXT:    movq %rax, %rdi
; AVX512BW-NEXT:    seto %bpl
; AVX512BW-NEXT:    movq %r10, %rax
; AVX512BW-NEXT:    mulq %r12
; AVX512BW-NEXT:    movq %rax, %rbx
; AVX512BW-NEXT:    seto %cl
; AVX512BW-NEXT:    orb %bpl, %cl
; AVX512BW-NEXT:    addq %rdi, %rbx
; AVX512BW-NEXT:    movq %r12, %rax
; AVX512BW-NEXT:    mulq %r15
; AVX512BW-NEXT:    movq %rax, %r10
; AVX512BW-NEXT:    movq %rdx, %r15
; AVX512BW-NEXT:    addq %rbx, %r15
; AVX512BW-NEXT:    setb %al
; AVX512BW-NEXT:    orb %cl, %al
; AVX512BW-NEXT:    orb %r13b, %al
; AVX512BW-NEXT:    kmovd %eax, %k0
; AVX512BW-NEXT:    testq %r9, %r9
; AVX512BW-NEXT:    setne %al
; AVX512BW-NEXT:    testq %rsi, %rsi
; AVX512BW-NEXT:    setne %cl
; AVX512BW-NEXT:    andb %al, %cl
; AVX512BW-NEXT:    movq %rsi, %rax
; AVX512BW-NEXT:    mulq %r8
; AVX512BW-NEXT:    movq %rax, %rsi
; AVX512BW-NEXT:    seto %bpl
; AVX512BW-NEXT:    movq %r9, %rax
; AVX512BW-NEXT:    mulq %r11
; AVX512BW-NEXT:    movq %rax, %rdi
; AVX512BW-NEXT:    seto %bl
; AVX512BW-NEXT:    orb %bpl, %bl
; AVX512BW-NEXT:    addq %rsi, %rdi
; AVX512BW-NEXT:    movq %r11, %rax
; AVX512BW-NEXT:    mulq %r8
; AVX512BW-NEXT:    addq %rdi, %rdx
; AVX512BW-NEXT:    setb %sil
; AVX512BW-NEXT:    orb %bl, %sil
; AVX512BW-NEXT:    orb %cl, %sil
; AVX512BW-NEXT:    andl $1, %esi
; AVX512BW-NEXT:    kmovw %esi, %k1
; AVX512BW-NEXT:    kshiftlw $1, %k0, %k0
; AVX512BW-NEXT:    korw %k0, %k1, %k1
; AVX512BW-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512BW-NEXT:    movq %r10, 16(%r14)
; AVX512BW-NEXT:    movq %rax, (%r14)
; AVX512BW-NEXT:    movq %r15, 24(%r14)
; AVX512BW-NEXT:    movq %rdx, 8(%r14)
; AVX512BW-NEXT:    popq %rbx
; AVX512BW-NEXT:    popq %r12
; AVX512BW-NEXT:    popq %r13
; AVX512BW-NEXT:    popq %r14
; AVX512BW-NEXT:    popq %r15
; AVX512BW-NEXT:    popq %rbp
; AVX512BW-NEXT:    retq
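; There is no vector i128 multiply, so each 128-bit umul.with.overflow lane is
; scalarized into three 64-bit mulq operations. Overflow is flagged when both
; high halves are nonzero (testq/setne/andb), when either hi*lo cross product
; overflows 64 bits (seto), or when adding the cross products into the high
; word of lo*lo carries out (setb); the flags are OR'd into one bit per lane.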
  %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1)
  %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0
  %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1
  %res = sext <2 x i1> %obit to <2 x i32>
  store <2 x i128> %val, <2 x i128>* %p2
  ret <2 x i32> %res
}
