; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512F
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512BW
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512DQ
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BW
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQ

;
; Subvector Load + Broadcast
;
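; Each test below loads a 128-bit or 256-bit subvector and splats it across the
; wider result via shufflevector; the CHECK lines expect the load to fold into a
; single subvector broadcast (vbroadcastf128 on AVX, or the wider
; vbroadcast[fi]32x4/64x2/64x4/32x8 forms on the AVX512 subtargets).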

define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2f64_4f64:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_2f64_4f64:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_4f64:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_2f64_4f64:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}
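
; Note: the vbroadcast[fi]64x2 and vbroadcast[fi]32x8 forms require AVX512DQ,
; so the plain AVX512F and AVX512BW runs select the equivalent AVX512F-only
; vbroadcast[fi]32x4/64x4 patterns instead, as the CHECK lines above show.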

define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2f64_8f64:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_2f64_8f64:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_2f64_8f64:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
; X64-AVX512DQ-NEXT:    retq
 %1 = load <2 x double>, <2 x double> *%p
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x double> %2
}

define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f64_8f64:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x double>, <4 x double> *%p
 %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2i64_4i64:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_2i64_4i64:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}

define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_2i64_8i64:
; X32-AVX1:       ## BB#0:
; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT:    retl
;
; X32-AVX2-LABEL: test_broadcast_2i64_8i64:
; X32-AVX2:       ## BB#0:
; X32-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_2i64_8i64:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_2i64_8i64:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX1-LABEL: test_broadcast_2i64_8i64:
; X64-AVX1:       ## BB#0:
; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: test_broadcast_2i64_8i64:
; X64-AVX2:       ## BB#0:
; X64-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
; X64-AVX512DQ-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64> *%p
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
 ret <8 x i64> %2
}

define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i64_8i64:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i64>, <4 x i64> *%p
 %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f32_8f32:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4f32_8f32:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_8f32:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_8f32:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}

define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f32_16f32:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4f32_16f32:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x float>, <4 x float> *%p
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x float> %2
}

define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8f32_16f32:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_8f32_16f32:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_8f32_16f32:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; X64-AVX512DQ-NEXT:    retq
 %1 = load <8 x float>, <8 x float> *%p
 %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i32_8i32:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_4i32_16i32:
; X32-AVX1:       ## BB#0:
; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT:    retl
;
; X32-AVX2-LABEL: test_broadcast_4i32_16i32:
; X32-AVX2:       ## BB#0:
; X32-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_16i32:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: test_broadcast_4i32_16i32:
; X64-AVX1:       ## BB#0:
; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: test_broadcast_4i32_16i32:
; X64-AVX2:       ## BB#0:
; X64-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32> *%p
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <16 x i32> %2
}

define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i32_16i32:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_8i32_16i32:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i32_16i32:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; X64-AVX512DQ-NEXT:    retq
 %1 = load <8 x i32>, <8 x i32> *%p
 %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_16i16:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_8i16_16i16:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}

define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_8i16_32i16:
; X32-AVX1:       ## BB#0:
; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT:    retl
;
; X32-AVX2-LABEL: test_broadcast_8i16_32i16:
; X32-AVX2:       ## BB#0:
; X32-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX1-LABEL: test_broadcast_8i16_32i16:
; X64-AVX1:       ## BB#0:
; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: test_broadcast_8i16_32i16:
; X64-AVX2:       ## BB#0:
; X64-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <32 x i16> %2
}
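
; 512-bit i16/i8 vectors are only legal with AVX512BW, so in the i16/i8 tests
; the AVX512F and AVX512DQ runs return the broadcast as two ymm halves rather
; than a single zmm register.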

define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i16_32i16:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = load <16 x i16>, <16 x i16> *%p
 %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_32i8:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_16i8_32i8:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}

define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_16i8_64i8:
; X32-AVX1:       ## BB#0:
; X32-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT:    retl
;
; X32-AVX2-LABEL: test_broadcast_16i8_64i8:
; X32-AVX2:       ## BB#0:
; X32-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX1-LABEL: test_broadcast_16i8_64i8:
; X64-AVX1:       ## BB#0:
; X64-AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: test_broadcast_16i8_64i8:
; X64-AVX2:       ## BB#0:
; X64-AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <64 x i8> %2
}

define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_32i8_64i8:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT:    retq
 %1 = load <32 x i8>, <32 x i8> *%p
 %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
 ret <64 x i8> %2
}

;
; Subvector Load + Broadcast + Store
;
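; Here the loaded subvector is also stored, so the load cannot simply be folded
; into a broadcast; the CHECK lines expect an explicit xmm load/store pair
; followed by a vinsert[fi]128 that splats the register into the wider result.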

define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X32-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X32-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_2f64_4f64_reuse:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT:    vmovaps (%ecx), %xmm0
; X32-AVX512F-NEXT:    vmovaps %xmm0, (%eax)
; X32-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT:    vmovaps (%ecx), %xmm0
; X32-AVX512BW-NEXT:    vmovaps %xmm0, (%eax)
; X32-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT:    vmovapd (%ecx), %xmm0
; X32-AVX512DQ-NEXT:    vmovapd %xmm0, (%eax)
; X32-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX512F-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX512BW-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX512BW-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vmovapd (%rdi), %xmm0
; X64-AVX512DQ-NEXT:    vmovapd %xmm0, (%rsi)
; X64-AVX512DQ-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
 %1 = load <2 x double>, <2 x double>* %p0
 store <2 x double> %1, <2 x double>* %p1
 %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X32-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X32-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X32-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
 %1 = load <2 x i64>, <2 x i64>* %p0
 store <2 x i64> %1, <2 x i64>* %p1
 %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
; X32:       ## BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    vmovaps (%ecx), %xmm0
; X32-NEXT:    vmovaps %xmm0, (%eax)
; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT:    retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64:       ## BB#0:
; X64-NEXT:    vmovaps (%rdi), %xmm0
; X64-NEXT:    vmovaps %xmm0, (%rsi)
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
 %1 = load <4 x float>, <4 x float>* %p0
 store <4 x float> %1, <4 x float>* %p1
 %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X32-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X32-AVX512:       ## BB#0:
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512-NEXT:    vmovdqa %xmm0, (%eax)
; X32-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX512:       ## BB#0:
; X64-AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT:    retq
 %1 = load <4 x i32>, <4 x i32>* %p0
 store <4 x i32> %1, <4 x i32>* %p1
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
 ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X32-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT:    vmovdqa %xmm0, (%eax)
; X32-AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT:    vmovdqu (%ecx), %xmm0
; X32-AVX512BW-NEXT:    vmovdqu %xmm0, (%eax)
; X32-AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT:    vmovdqa %xmm0, (%eax)
; X32-AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi)
; X64-AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
 %1 = load <8 x i16>, <8 x i16> *%p0
 store <8 x i16> %1, <8 x i16>* %p1
 %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X32-AVX-NEXT:    vmovaps %xmm0, (%eax)
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT:    vmovdqa %xmm0, (%eax)
; X32-AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT:    vmovdqu (%ecx), %xmm0
; X32-AVX512BW-NEXT:    vmovdqu %xmm0, (%eax)
; X32-AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT:    vmovdqa %xmm0, (%eax)
; X32-AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vmovdqu (%rdi), %xmm0
; X64-AVX512BW-NEXT:    vmovdqu %xmm0, (%rsi)
; X64-AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT:    vmovdqa %xmm0, (%rsi)
; X64-AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
 %1 = load <16 x i8>, <16 x i8> *%p0
 store <16 x i8> %1, <16 x i8>* %p1
 %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ret <32 x i8> %2
}

;
; Subvector Load + Broadcast with Separate Store
;
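; These tests interleave an unrelated store of zeroinitializer between the
; subvector load and the splat to exercise chain dependencies; the CHECK lines
; show the load kept separate from the store, with vinsert rebuilding the splat.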

define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT:    vmovdqa %xmm1, (%eax)
; X32-AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512BW-NEXT:    vmovdqa %xmm1, (%eax)
; X32-AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512DQ-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT:    vmovdqa %xmm1, (%rsi)
; X64-AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
; X64-AVX512BW-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX:       ## BB#0:
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT:    vmovaps (%ecx), %xmm0
; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X32-AVX-NEXT:    retl
;
; X32-AVX512F-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512F:       ## BB#0:
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT:    vmovdqa %xmm1, (%eax)
; X32-AVX512F-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X32-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512F-NEXT:    retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512BW:       ## BB#0:
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512BW-NEXT:    vmovdqa %xmm1, (%eax)
; X32-AVX512BW-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT:    retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512DQ:       ## BB#0:
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT:    vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512DQ-NEXT:    vmovaps %xmm1, (%eax)
; X32-AVX512DQ-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X32-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512DQ-NEXT:    retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX:       ## BB#0:
; X64-AVX-NEXT:    vmovaps (%rdi), %xmm0
; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT:    vmovaps %ymm0, %ymm1
; X64-AVX-NEXT:    retq
;
; X64-AVX512F-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512F:       ## BB#0:
; X64-AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT:    vmovdqa %xmm1, (%rsi)
; X64-AVX512F-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X64-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512F-NEXT:    retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512BW:       ## BB#0:
; X64-AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
; X64-AVX512BW-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT:    retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512DQ:       ## BB#0:
; X64-AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT:    vmovaps %xmm1, (%rsi)
; X64-AVX512DQ-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X64-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512DQ-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}
1294
;
; Subvector Load With Multiple Uses + Broadcast
; Since the loaded subvector has additional uses, codegen should fall back to
; broadcasting the already-loaded value instead of repeating the load.
;
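; (For example, in the X64-AVX512 runs below, the 256-bit constant is
; materialized once in %ymm2 and widened into %zmm2 with a single
; vinserti64x4/vinsertf64x4 rather than loading a second 512-bit constant.)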

@ga4 = global <4 x i64> zeroinitializer, align 8
@gb4 = global <8 x i64> zeroinitializer, align 8
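; Note: both globals are only 8-byte aligned, below the natural 32/64-byte
; vector alignment, which is why the checked stores use the unaligned forms
; (vmovups/vmovdqu).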

define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX1:       ## BB#0: ## %entry
; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [1,0,2,0,3,0,4,0]
; X32-AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm3, %xmm3
; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
; X32-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X32-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm3, %xmm3
; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
; X32-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X32-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm3, %xmm3
; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm1, %xmm1
; X32-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; X32-AVX1-NEXT:    vandps %ymm4, %ymm1, %ymm1
; X32-AVX1-NEXT:    vandps %ymm4, %ymm2, %ymm2
; X32-AVX1-NEXT:    vmovups %ymm0, _ga4
; X32-AVX1-NEXT:    vmovups %ymm2, _gb4+32
; X32-AVX1-NEXT:    vmovups %ymm1, _gb4
; X32-AVX1-NEXT:    vzeroupper
; X32-AVX1-NEXT:    retl
;
; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX2:       ## BB#0: ## %entry
; X32-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X32-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT:    vmovdqu %ymm0, _ga4
; X32-AVX2-NEXT:    vmovdqu %ymm2, _gb4+32
; X32-AVX2-NEXT:    vmovdqu %ymm1, _gb4
; X32-AVX2-NEXT:    vzeroupper
; X32-AVX2-NEXT:    retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX512:       ## BB#0: ## %entry
; X32-AVX512-NEXT:    vpaddq LCPI26_0, %ymm0, %ymm0
; X32-AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
; X32-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vmovdqu %ymm0, _ga4
; X32-AVX512-NEXT:    vmovdqu64 %zmm1, _gb4
; X32-AVX512-NEXT:    retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX1:       ## BB#0: ## %entry
; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [3,4]
; X64-AVX1-NEXT:    vpaddq %xmm4, %xmm3, %xmm3
; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [1,2]
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovaps {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm2, %xmm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X64-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
; X64-AVX1-NEXT:    vpaddq %xmm4, %xmm6, %xmm4
; X64-AVX1-NEXT:    vpaddq %xmm5, %xmm1, %xmm1
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm3, %ymm1, %ymm1
; X64-AVX1-NEXT:    vandps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT:    vmovups %ymm0, {{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm2, _gb4+{{.*}}(%rip)
; X64-AVX1-NEXT:    vmovups %ymm1, {{.*}}(%rip)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX2:       ## BB#0: ## %entry
; X64-AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm2, _gb4+{{.*}}(%rip)
; X64-AVX2-NEXT:    vmovdqu %ymm1, {{.*}}(%rip)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512:       ## BB#0: ## %entry
; X64-AVX512-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
; X64-AVX512-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovdqu64 %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    retq
entry:
  %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  store <4 x i64> %0, <4 x i64>* @ga4, align 8
  store <8 x i64> %2, <8 x i64>* @gb4, align 8
  ret void
}

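; The same fallback pattern with floating-point types: the <4 x double>
; constant feeds both the add and the div and, on AVX512, is widened to
; 512 bits with vinsertf64x4 rather than reloaded.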
@ga2 = global <4 x double> zeroinitializer, align 8
@gb2 = global <8 x double> zeroinitializer, align 8

define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X32-AVX:       ## BB#0: ## %entry
; X32-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X32-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X32-AVX-NEXT:    vmovupd %ymm0, _ga2
; X32-AVX-NEXT:    vmovupd %ymm2, _gb2+32
; X32-AVX-NEXT:    vmovupd %ymm1, _gb2
; X32-AVX-NEXT:    vzeroupper
; X32-AVX-NEXT:    retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X32-AVX512:       ## BB#0: ## %entry
; X32-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X32-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X32-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT:    vmovupd %ymm0, _ga2
; X32-AVX512-NEXT:    vmovupd %zmm1, _gb2
; X32-AVX512-NEXT:    retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX:       ## BB#0: ## %entry
; X64-AVX-NEXT:    vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm0, %ymm0
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT:    vdivpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm2, _gb2+{{.*}}(%rip)
; X64-AVX-NEXT:    vmovupd %ymm1, {{.*}}(%rip)
; X64-AVX-NEXT:    vzeroupper
; X64-AVX-NEXT:    retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512:       ## BB#0: ## %entry
; X64-AVX512-NEXT:    vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX512-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT:    vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT:    vaddpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT:    vmovupd %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT:    retq
entry:
  %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
  %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  store <4 x double> %0, <4 x double>* @ga2, align 8
  store <8 x double> %2, <8 x double>* @gb2, align 8
  ret void
}