1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41
4
; foo - converts the <3 x float> argument to <3 x i8> with fptoui, widens it
; to <4 x i8> with a shuffle (lane 3 is undef at that point), overwrites
; lane 3 with i8 -1, and stores the 4-byte result to %out (align 4).
; The expected code materializes the inserted element as the immediate 255.
5define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
6; SSE2-LABEL: foo:
7; SSE2:       # BB#0:
8; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
9; SSE2-NEXT:    movl $255, %eax
10; SSE2-NEXT:    movd %eax, %xmm1
11; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
12; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
13; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
14; SSE2-NEXT:    packuswb %xmm0, %xmm0
15; SSE2-NEXT:    packuswb %xmm0, %xmm0
16; SSE2-NEXT:    movd %xmm0, (%rdi)
17; SSE2-NEXT:    retq
18;
19; SSE41-LABEL: foo:
20; SSE41:       # BB#0:
21; SSE41-NEXT:    cvttps2dq %xmm0, %xmm0
22; SSE41-NEXT:    movl $255, %eax
23; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
24; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
25; SSE41-NEXT:    movd %xmm0, (%rdi)
26; SSE41-NEXT:    retq
27  %t0 = fptoui <3 x float> %in to <3 x i8>
28  %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
29  %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3
30  store <4 x i8> %t2, <4 x i8>* %out, align 4
31  ret void
32}
33
34; Verify that the DAGCombiner doesn't wrongly fold a build_vector into a
35; blend with a zero vector if the build_vector contains negative zero.
36
; test_negative_zero_1 - builds <A[0], -0.0, A[2], 0.0> from the input vector.
; Lane 1 is negative zero and lane 3 is positive zero. In the SSE4.1
; expectation lane 1 comes from a memory operand while lane 3 is the one that
; is zeroed, i.e. the two zeros are not treated alike.
37define <4 x float> @test_negative_zero_1(<4 x float> %A) {
38; SSE2-LABEL: test_negative_zero_1:
39; SSE2:       # BB#0: # %entry
40; SSE2-NEXT:    movaps %xmm0, %xmm1
41; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
42; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
43; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
44; SSE2-NEXT:    xorps %xmm2, %xmm2
45; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
46; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
47; SSE2-NEXT:    retq
48;
49; SSE41-LABEL: test_negative_zero_1:
50; SSE41:       # BB#0: # %entry
51; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2],zero
52; SSE41-NEXT:    retq
53entry:
54  %0 = extractelement <4 x float> %A, i32 0
55  %1 = insertelement <4 x float> undef, float %0, i32 0
56  %2 = insertelement <4 x float> %1, float -0.0, i32 1
57  %3 = extractelement <4 x float> %A, i32 2
58  %4 = insertelement <4 x float> %2, float %3, i32 2
59  %5 = insertelement <4 x float> %4, float 0.0, i32 3
60  ret <4 x float> %5
61}
62
; test_negative_zero_2 - builds <A[0], -0.0> as a <2 x double>.
; Both run lines share one expectation: the upper element is loaded from a
; memory operand (movhpd) while the lower element stays in xmm0.
63define <2 x double> @test_negative_zero_2(<2 x double> %A) {
64; CHECK-LABEL: test_negative_zero_2:
65; CHECK:       # BB#0: # %entry
66; CHECK-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
67; CHECK-NEXT:    retq
68entry:
69  %0 = extractelement <2 x double> %A, i32 0
70  %1 = insertelement <2 x double> undef, double %0, i32 0
71  %2 = insertelement <2 x double> %1, double -0.0, i32 1
72  ret <2 x double> %2
73}
74
; test_buildvector_v4f32_register - builds <f0, f1, f2, f3> from four scalar
; float arguments, inserting them into lanes 0..3 in order. The expectations
; show the scalars arriving in xmm0..xmm3.
75define <4 x float> @test_buildvector_v4f32_register(float %f0, float %f1, float %f2, float %f3) {
76; SSE2-LABEL: test_buildvector_v4f32_register:
77; SSE2:       # BB#0:
78; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
79; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
80; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
81; SSE2-NEXT:    retq
82;
83; SSE41-LABEL: test_buildvector_v4f32_register:
84; SSE41:       # BB#0:
85; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
86; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
87; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
88; SSE41-NEXT:    retq
89  %ins0 = insertelement <4 x float> undef, float %f0, i32 0
90  %ins1 = insertelement <4 x float> %ins0, float %f1, i32 1
91  %ins2 = insertelement <4 x float> %ins1, float %f2, i32 2
92  %ins3 = insertelement <4 x float> %ins2, float %f3, i32 3
93  ret <4 x float> %ins3
94}
95
; test_buildvector_v4f32_load - builds a <4 x float> whose four lanes are each
; loaded (align 4) from a separate pointer argument %p0..%p3, in lane order.
96define <4 x float> @test_buildvector_v4f32_load(float* %p0, float* %p1, float* %p2, float* %p3) {
97; SSE2-LABEL: test_buildvector_v4f32_load:
98; SSE2:       # BB#0:
99; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
100; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
101; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
102; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
103; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
104; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
105; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
106; SSE2-NEXT:    retq
107;
108; SSE41-LABEL: test_buildvector_v4f32_load:
109; SSE41:       # BB#0:
110; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
111; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
112; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
113; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
114; SSE41-NEXT:    retq
115  %f0 = load float, float* %p0, align 4
116  %f1 = load float, float* %p1, align 4
117  %f2 = load float, float* %p2, align 4
118  %f3 = load float, float* %p3, align 4
119  %ins0 = insertelement <4 x float> undef, float %f0, i32 0
120  %ins1 = insertelement <4 x float> %ins0, float %f1, i32 1
121  %ins2 = insertelement <4 x float> %ins1, float %f2, i32 2
122  %ins3 = insertelement <4 x float> %ins2, float %f3, i32 3
123  ret <4 x float> %ins3
124}
125
; test_buildvector_v4f32_partial_load - builds <f0, f1, f2, *p3>: lanes 0..2
; come from scalar float arguments and lane 3 is loaded (align 4) from %p3.
126define <4 x float> @test_buildvector_v4f32_partial_load(float %f0, float %f1, float %f2, float* %p3) {
127; SSE2-LABEL: test_buildvector_v4f32_partial_load:
128; SSE2:       # BB#0:
129; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
130; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
131; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
132; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
133; SSE2-NEXT:    retq
134;
135; SSE41-LABEL: test_buildvector_v4f32_partial_load:
136; SSE41:       # BB#0:
137; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
138; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
139; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
140; SSE41-NEXT:    retq
141  %f3 = load float, float* %p3, align 4
142  %ins0 = insertelement <4 x float> undef, float %f0, i32 0
143  %ins1 = insertelement <4 x float> %ins0, float %f1, i32 1
144  %ins2 = insertelement <4 x float> %ins1, float %f2, i32 2
145  %ins3 = insertelement <4 x float> %ins2, float %f3, i32 3
146  ret <4 x float> %ins3
147}
148
; test_buildvector_v4i32_register - builds <a0, a1, a2, a3> from four i32
; arguments, inserting them into lanes 0..3 in order. The expectations show
; the scalars arriving in edi, esi, edx and ecx.
149define <4 x i32> @test_buildvector_v4i32_register(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
150; SSE2-LABEL: test_buildvector_v4i32_register:
151; SSE2:       # BB#0:
152; SSE2-NEXT:    movd %ecx, %xmm0
153; SSE2-NEXT:    movd %edx, %xmm1
154; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
155; SSE2-NEXT:    movd %esi, %xmm2
156; SSE2-NEXT:    movd %edi, %xmm0
157; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
158; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
159; SSE2-NEXT:    retq
160;
161; SSE41-LABEL: test_buildvector_v4i32_register:
162; SSE41:       # BB#0:
163; SSE41-NEXT:    movd %edi, %xmm0
164; SSE41-NEXT:    pinsrd $1, %esi, %xmm0
165; SSE41-NEXT:    pinsrd $2, %edx, %xmm0
166; SSE41-NEXT:    pinsrd $3, %ecx, %xmm0
167; SSE41-NEXT:    retq
168  %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
169  %ins1 = insertelement <4 x i32> %ins0, i32 %a1, i32 1
170  %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
171  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
172  ret <4 x i32> %ins3
173}
174
; test_buildvector_v4i32_partial - builds <a0, undef, undef, a3>: only lanes 0
; and 3 carry defined values (the two i32 arguments); lanes 1 and 2 are undef.
175define <4 x i32> @test_buildvector_v4i32_partial(i32 %a0, i32 %a3) {
176; SSE2-LABEL: test_buildvector_v4i32_partial:
177; SSE2:       # BB#0:
178; SSE2-NEXT:    movd %edi, %xmm0
179; SSE2-NEXT:    movd %esi, %xmm1
180; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
181; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
182; SSE2-NEXT:    retq
183;
184; SSE41-LABEL: test_buildvector_v4i32_partial:
185; SSE41:       # BB#0:
186; SSE41-NEXT:    movd %edi, %xmm0
187; SSE41-NEXT:    pinsrd $3, %esi, %xmm0
188; SSE41-NEXT:    retq
189  %ins0 = insertelement <4 x i32> undef, i32   %a0, i32 0
190  %ins1 = insertelement <4 x i32> %ins0, i32 undef, i32 1
191  %ins2 = insertelement <4 x i32> %ins1, i32 undef, i32 2
192  %ins3 = insertelement <4 x i32> %ins2, i32   %a3, i32 3
193  ret <4 x i32> %ins3
194}
195
; test_buildvector_v4i32_register_zero - builds <a0, 0, a2, a3>: lane 1 is the
; constant zero and the other lanes come from the three i32 arguments.
; Both run lines share a single expectation here.
196define <4 x i32> @test_buildvector_v4i32_register_zero(i32 %a0, i32 %a2, i32 %a3) {
197; CHECK-LABEL: test_buildvector_v4i32_register_zero:
198; CHECK:       # BB#0:
199; CHECK-NEXT:    movd %edx, %xmm0
200; CHECK-NEXT:    movd %esi, %xmm1
201; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
202; CHECK-NEXT:    movd %edi, %xmm0
203; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
204; CHECK-NEXT:    retq
205  %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
206  %ins1 = insertelement <4 x i32> %ins0, i32   0, i32 1
207  %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
208  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
209  ret <4 x i32> %ins3
210}
211
; test_buildvector_v4i32_register_zero_2 - builds <0, a1, a2, a3>: lane 0 is
; the constant zero and lanes 1..3 come from the three i32 arguments.
; Both run lines share a single expectation here.
212define <4 x i32> @test_buildvector_v4i32_register_zero_2(i32 %a1, i32 %a2, i32 %a3) {
213; CHECK-LABEL: test_buildvector_v4i32_register_zero_2:
214; CHECK:       # BB#0:
215; CHECK-NEXT:    movd %edx, %xmm0
216; CHECK-NEXT:    movd %esi, %xmm1
217; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
218; CHECK-NEXT:    movd %edi, %xmm0
219; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
220; CHECK-NEXT:    retq
221  %ins0 = insertelement <4 x i32> undef, i32   0, i32 0
222  %ins1 = insertelement <4 x i32> %ins0, i32 %a1, i32 1
223  %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
224  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
225  ret <4 x i32> %ins3
226}
227
; test_buildvector_v8i16_register - builds <a0..a7> from eight i16 arguments,
; inserting them into lanes 0..7 in order. In the SSE4.1 expectation the last
; two elements are inserted from stack-relative memory operands.
228define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) {
229; SSE2-LABEL: test_buildvector_v8i16_register:
230; SSE2:       # BB#0:
231; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
232; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
233; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
234; SSE2-NEXT:    movd %r9d, %xmm0
235; SSE2-NEXT:    movd %r8d, %xmm2
236; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
237; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
238; SSE2-NEXT:    movd %ecx, %xmm0
239; SSE2-NEXT:    movd %edx, %xmm1
240; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
241; SSE2-NEXT:    movd %esi, %xmm3
242; SSE2-NEXT:    movd %edi, %xmm0
243; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
244; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
245; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
246; SSE2-NEXT:    retq
247;
248; SSE41-LABEL: test_buildvector_v8i16_register:
249; SSE41:       # BB#0:
250; SSE41-NEXT:    movd %edi, %xmm0
251; SSE41-NEXT:    pinsrw $1, %esi, %xmm0
252; SSE41-NEXT:    pinsrw $2, %edx, %xmm0
253; SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
254; SSE41-NEXT:    pinsrw $4, %r8d, %xmm0
255; SSE41-NEXT:    pinsrw $5, %r9d, %xmm0
256; SSE41-NEXT:    pinsrw $6, {{[0-9]+}}(%rsp), %xmm0
257; SSE41-NEXT:    pinsrw $7, {{[0-9]+}}(%rsp), %xmm0
258; SSE41-NEXT:    retq
259  %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
260  %ins1 = insertelement <8 x i16> %ins0, i16 %a1, i32 1
261  %ins2 = insertelement <8 x i16> %ins1, i16 %a2, i32 2
262  %ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3
263  %ins4 = insertelement <8 x i16> %ins3, i16 %a4, i32 4
264  %ins5 = insertelement <8 x i16> %ins4, i16 %a5, i32 5
265  %ins6 = insertelement <8 x i16> %ins5, i16 %a6, i32 6
266  %ins7 = insertelement <8 x i16> %ins6, i16 %a7, i32 7
267  ret <8 x i16> %ins7
268}
269
; test_buildvector_v8i16_partial - builds an <8 x i16> in which only lanes 1,
; 3, 4 and 5 are defined (from the four i16 arguments); lanes 0, 2, 6 and 7
; are undef. Both run lines share a single expectation here.
270define <8 x i16> @test_buildvector_v8i16_partial(i16 %a1, i16 %a3, i16 %a4, i16 %a5) {
271; CHECK-LABEL: test_buildvector_v8i16_partial:
272; CHECK:       # BB#0:
273; CHECK-NEXT:    pxor %xmm0, %xmm0
274; CHECK-NEXT:    pinsrw $1, %edi, %xmm0
275; CHECK-NEXT:    pinsrw $3, %esi, %xmm0
276; CHECK-NEXT:    pinsrw $4, %edx, %xmm0
277; CHECK-NEXT:    pinsrw $5, %ecx, %xmm0
278; CHECK-NEXT:    retq
279  %ins0 = insertelement <8 x i16> undef, i16 undef, i32 0
280  %ins1 = insertelement <8 x i16> %ins0, i16   %a1, i32 1
281  %ins2 = insertelement <8 x i16> %ins1, i16 undef, i32 2
282  %ins3 = insertelement <8 x i16> %ins2, i16   %a3, i32 3
283  %ins4 = insertelement <8 x i16> %ins3, i16   %a4, i32 4
284  %ins5 = insertelement <8 x i16> %ins4, i16   %a5, i32 5
285  %ins6 = insertelement <8 x i16> %ins5, i16 undef, i32 6
286  %ins7 = insertelement <8 x i16> %ins6, i16 undef, i32 7
287  ret <8 x i16> %ins7
288}
289
; test_buildvector_v8i16_register_zero - builds an <8 x i16> with lanes 0, 3,
; 4 and 5 taken from the four i16 arguments and lanes 1, 2, 6 and 7 set to the
; constant zero. Both run lines share a single expectation here.
290define <8 x i16> @test_buildvector_v8i16_register_zero(i16 %a0, i16 %a3, i16 %a4, i16 %a5) {
291; CHECK-LABEL: test_buildvector_v8i16_register_zero:
292; CHECK:       # BB#0:
293; CHECK-NEXT:    pxor %xmm0, %xmm0
294; CHECK-NEXT:    pinsrw $0, %edi, %xmm0
295; CHECK-NEXT:    pinsrw $3, %esi, %xmm0
296; CHECK-NEXT:    pinsrw $4, %edx, %xmm0
297; CHECK-NEXT:    pinsrw $5, %ecx, %xmm0
298; CHECK-NEXT:    retq
299  %ins0 = insertelement <8 x i16> undef, i16   %a0, i32 0
300  %ins1 = insertelement <8 x i16> %ins0, i16     0, i32 1
301  %ins2 = insertelement <8 x i16> %ins1, i16     0, i32 2
302  %ins3 = insertelement <8 x i16> %ins2, i16   %a3, i32 3
303  %ins4 = insertelement <8 x i16> %ins3, i16   %a4, i32 4
304  %ins5 = insertelement <8 x i16> %ins4, i16   %a5, i32 5
305  %ins6 = insertelement <8 x i16> %ins5, i16     0, i32 6
306  %ins7 = insertelement <8 x i16> %ins6, i16     0, i32 7
307  ret <8 x i16> %ins7
308}
309
; test_buildvector_v8i16_register_zero_2 - builds an <8 x i16> with lanes 1,
; 3, 4 and 5 taken from the four i16 arguments and lanes 0, 2, 6 and 7 set to
; the constant zero. Unlike the preceding variant, lane 0 is a zero rather
; than an argument. Both run lines share a single expectation here.
310define <8 x i16> @test_buildvector_v8i16_register_zero_2(i16 %a1, i16 %a3, i16 %a4, i16 %a5) {
311; CHECK-LABEL: test_buildvector_v8i16_register_zero_2:
312; CHECK:       # BB#0:
313; CHECK-NEXT:    pxor %xmm0, %xmm0
314; CHECK-NEXT:    pinsrw $1, %edi, %xmm0
315; CHECK-NEXT:    pinsrw $3, %esi, %xmm0
316; CHECK-NEXT:    pinsrw $4, %edx, %xmm0
317; CHECK-NEXT:    pinsrw $5, %ecx, %xmm0
318; CHECK-NEXT:    retq
319  %ins0 = insertelement <8 x i16> undef, i16     0, i32 0
320  %ins1 = insertelement <8 x i16> %ins0, i16   %a1, i32 1
321  %ins2 = insertelement <8 x i16> %ins1, i16     0, i32 2
322  %ins3 = insertelement <8 x i16> %ins2, i16   %a3, i32 3
323  %ins4 = insertelement <8 x i16> %ins3, i16   %a4, i32 4
324  %ins5 = insertelement <8 x i16> %ins4, i16   %a5, i32 5
325  %ins6 = insertelement <8 x i16> %ins5, i16     0, i32 6
326  %ins7 = insertelement <8 x i16> %ins6, i16     0, i32 7
327  ret <8 x i16> %ins7
328}
329
; test_buildvector_v16i8_register - builds <a0..a15> from sixteen i8
; arguments, inserting them into lanes 0..15 in order. In the SSE4.1
; expectation elements 6..15 are inserted from stack-relative memory operands;
; the plain SSE2 expectation merges pairs with a tree of unpack instructions.
330define <16 x i8> @test_buildvector_v16i8_register(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) {
331; SSE2-LABEL: test_buildvector_v16i8_register:
332; SSE2:       # BB#0:
333; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
334; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
335; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
336; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
337; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
338; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
339; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
340; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
341; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
342; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
343; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
344; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
345; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
346; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
347; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
348; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
349; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
350; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
351; SSE2-NEXT:    movd %r9d, %xmm0
352; SSE2-NEXT:    movd %r8d, %xmm2
353; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
354; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
355; SSE2-NEXT:    movd %ecx, %xmm0
356; SSE2-NEXT:    movd %edx, %xmm1
357; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
358; SSE2-NEXT:    movd %esi, %xmm4
359; SSE2-NEXT:    movd %edi, %xmm0
360; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
361; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
362; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
363; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
364; SSE2-NEXT:    retq
365;
366; SSE41-LABEL: test_buildvector_v16i8_register:
367; SSE41:       # BB#0:
368; SSE41-NEXT:    movd %edi, %xmm0
369; SSE41-NEXT:    pinsrb $1, %esi, %xmm0
370; SSE41-NEXT:    pinsrb $2, %edx, %xmm0
371; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
372; SSE41-NEXT:    pinsrb $4, %r8d, %xmm0
373; SSE41-NEXT:    pinsrb $5, %r9d, %xmm0
374; SSE41-NEXT:    pinsrb $6, {{[0-9]+}}(%rsp), %xmm0
375; SSE41-NEXT:    pinsrb $7, {{[0-9]+}}(%rsp), %xmm0
376; SSE41-NEXT:    pinsrb $8, {{[0-9]+}}(%rsp), %xmm0
377; SSE41-NEXT:    pinsrb $9, {{[0-9]+}}(%rsp), %xmm0
378; SSE41-NEXT:    pinsrb $10, {{[0-9]+}}(%rsp), %xmm0
379; SSE41-NEXT:    pinsrb $11, {{[0-9]+}}(%rsp), %xmm0
380; SSE41-NEXT:    pinsrb $12, {{[0-9]+}}(%rsp), %xmm0
381; SSE41-NEXT:    pinsrb $13, {{[0-9]+}}(%rsp), %xmm0
382; SSE41-NEXT:    pinsrb $14, {{[0-9]+}}(%rsp), %xmm0
383; SSE41-NEXT:    pinsrb $15, {{[0-9]+}}(%rsp), %xmm0
384; SSE41-NEXT:    retq
385  %ins0  = insertelement <16 x i8> undef,  i8 %a0,  i32 0
386  %ins1  = insertelement <16 x i8> %ins0,  i8 %a1,  i32 1
387  %ins2  = insertelement <16 x i8> %ins1,  i8 %a2,  i32 2
388  %ins3  = insertelement <16 x i8> %ins2,  i8 %a3,  i32 3
389  %ins4  = insertelement <16 x i8> %ins3,  i8 %a4,  i32 4
390  %ins5  = insertelement <16 x i8> %ins4,  i8 %a5,  i32 5
391  %ins6  = insertelement <16 x i8> %ins5,  i8 %a6,  i32 6
392  %ins7  = insertelement <16 x i8> %ins6,  i8 %a7,  i32 7
393  %ins8  = insertelement <16 x i8> %ins7,  i8 %a8,  i32 8
394  %ins9  = insertelement <16 x i8> %ins8,  i8 %a9,  i32 9
395  %ins10 = insertelement <16 x i8> %ins9,  i8 %a10, i32 10
396  %ins11 = insertelement <16 x i8> %ins10, i8 %a11, i32 11
397  %ins12 = insertelement <16 x i8> %ins11, i8 %a12, i32 12
398  %ins13 = insertelement <16 x i8> %ins12, i8 %a13, i32 13
399  %ins14 = insertelement <16 x i8> %ins13, i8 %a14, i32 14
400  %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
401  ret <16 x i8> %ins15
402}
403
; test_buildvector_v16i8_partial - builds a <16 x i8> in which only lanes 2,
; 6, 8, 11, 12 and 15 are defined (from the six i8 arguments); every other
; lane is undef. The plain SSE2 expectation assembles the bytes with 16-bit
; inserts, shifting odd-lane bytes left by 8 first; the SSE4.1 expectation
; uses one byte insert per defined lane.
404define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
405; SSE2-LABEL: test_buildvector_v16i8_partial:
406; SSE2:       # BB#0:
407; SSE2-NEXT:    movzbl %dil, %eax
408; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
409; SSE2-NEXT:    movzbl %sil, %eax
410; SSE2-NEXT:    pinsrw $3, %eax, %xmm0
411; SSE2-NEXT:    movzbl %dl, %eax
412; SSE2-NEXT:    pinsrw $4, %eax, %xmm0
413; SSE2-NEXT:    shll $8, %ecx
414; SSE2-NEXT:    pinsrw $5, %ecx, %xmm0
415; SSE2-NEXT:    movzbl %r8b, %eax
416; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
417; SSE2-NEXT:    shll $8, %r9d
418; SSE2-NEXT:    pinsrw $7, %r9d, %xmm0
419; SSE2-NEXT:    retq
420;
421; SSE41-LABEL: test_buildvector_v16i8_partial:
422; SSE41:       # BB#0:
423; SSE41-NEXT:    pxor %xmm0, %xmm0
424; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
425; SSE41-NEXT:    pinsrb $6, %esi, %xmm0
426; SSE41-NEXT:    pinsrb $8, %edx, %xmm0
427; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
428; SSE41-NEXT:    pinsrb $12, %r8d, %xmm0
429; SSE41-NEXT:    pinsrb $15, %r9d, %xmm0
430; SSE41-NEXT:    retq
431  %ins0  = insertelement <16 x i8> undef,  i8 undef, i32 0
432  %ins1  = insertelement <16 x i8> %ins0,  i8 undef, i32 1
433  %ins2  = insertelement <16 x i8> %ins1,  i8   %a2, i32 2
434  %ins3  = insertelement <16 x i8> %ins2,  i8 undef, i32 3
435  %ins4  = insertelement <16 x i8> %ins3,  i8 undef, i32 4
436  %ins5  = insertelement <16 x i8> %ins4,  i8 undef, i32 5
437  %ins6  = insertelement <16 x i8> %ins5,  i8   %a6, i32 6
438  %ins7  = insertelement <16 x i8> %ins6,  i8 undef, i32 7
439  %ins8  = insertelement <16 x i8> %ins7,  i8   %a8, i32 8
440  %ins9  = insertelement <16 x i8> %ins8,  i8 undef, i32 9
441  %ins10 = insertelement <16 x i8> %ins9,  i8 undef, i32 10
442  %ins11 = insertelement <16 x i8> %ins10, i8  %a11, i32 11
443  %ins12 = insertelement <16 x i8> %ins11, i8  %a12, i32 12
444  %ins13 = insertelement <16 x i8> %ins12, i8 undef, i32 13
445  %ins14 = insertelement <16 x i8> %ins13, i8 undef, i32 14
446  %ins15 = insertelement <16 x i8> %ins14, i8  %a15, i32 15
447  ret <16 x i8> %ins15
448}
449
; test_buildvector_v16i8_register_zero - builds a <16 x i8> with lanes 0, 4,
; 6, 8, 11, 12 and 15 taken from the seven i8 arguments and every other lane
; set to the constant zero. In both expectations the value for lane 15 is
; read from a stack-relative memory operand.
450define <16 x i8> @test_buildvector_v16i8_register_zero(i8 %a0, i8 %a4, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
451; SSE2-LABEL: test_buildvector_v16i8_register_zero:
452; SSE2:       # BB#0:
453; SSE2-NEXT:    movzbl %sil, %eax
454; SSE2-NEXT:    movzbl %dil, %esi
455; SSE2-NEXT:    movd %esi, %xmm0
456; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
457; SSE2-NEXT:    movzbl %dl, %eax
458; SSE2-NEXT:    pinsrw $3, %eax, %xmm0
459; SSE2-NEXT:    movzbl %cl, %eax
460; SSE2-NEXT:    pinsrw $4, %eax, %xmm0
461; SSE2-NEXT:    shll $8, %r8d
462; SSE2-NEXT:    pinsrw $5, %r8d, %xmm0
463; SSE2-NEXT:    movzbl %r9b, %eax
464; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
465; SSE2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
466; SSE2-NEXT:    shll $8, %eax
467; SSE2-NEXT:    pinsrw $7, %eax, %xmm0
468; SSE2-NEXT:    retq
469;
470; SSE41-LABEL: test_buildvector_v16i8_register_zero:
471; SSE41:       # BB#0:
472; SSE41-NEXT:    pxor %xmm0, %xmm0
473; SSE41-NEXT:    pinsrb $0, %edi, %xmm0
474; SSE41-NEXT:    pinsrb $4, %esi, %xmm0
475; SSE41-NEXT:    pinsrb $6, %edx, %xmm0
476; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
477; SSE41-NEXT:    pinsrb $11, %r8d, %xmm0
478; SSE41-NEXT:    pinsrb $12, %r9d, %xmm0
479; SSE41-NEXT:    pinsrb $15, {{[0-9]+}}(%rsp), %xmm0
480; SSE41-NEXT:    retq
481  %ins0  = insertelement <16 x i8> undef,  i8   %a0, i32 0
482  %ins1  = insertelement <16 x i8> %ins0,  i8     0, i32 1
483  %ins2  = insertelement <16 x i8> %ins1,  i8     0, i32 2
484  %ins3  = insertelement <16 x i8> %ins2,  i8     0, i32 3
485  %ins4  = insertelement <16 x i8> %ins3,  i8   %a4, i32 4
486  %ins5  = insertelement <16 x i8> %ins4,  i8     0, i32 5
487  %ins6  = insertelement <16 x i8> %ins5,  i8   %a6, i32 6
488  %ins7  = insertelement <16 x i8> %ins6,  i8     0, i32 7
489  %ins8  = insertelement <16 x i8> %ins7,  i8   %a8, i32 8
490  %ins9  = insertelement <16 x i8> %ins8,  i8     0, i32 9
491  %ins10 = insertelement <16 x i8> %ins9,  i8     0, i32 10
492  %ins11 = insertelement <16 x i8> %ins10, i8  %a11, i32 11
493  %ins12 = insertelement <16 x i8> %ins11, i8  %a12, i32 12
494  %ins13 = insertelement <16 x i8> %ins12, i8     0, i32 13
495  %ins14 = insertelement <16 x i8> %ins13, i8     0, i32 14
496  %ins15 = insertelement <16 x i8> %ins14, i8  %a15, i32 15
497  ret <16 x i8> %ins15
498}
499
; test_buildvector_v16i8_register_zero_2 - builds a <16 x i8> with lanes 2, 3,
; 6, 8, 11, 12 and 15 taken from the seven i8 arguments and every other lane
; set to the constant zero. Lanes 2 and 3 are adjacent bytes; the plain SSE2
; expectation combines them into one 16-bit value (shift + or) before a
; single word insert.
500define <16 x i8> @test_buildvector_v16i8_register_zero_2(i8 %a2, i8 %a3, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
501; SSE2-LABEL: test_buildvector_v16i8_register_zero_2:
502; SSE2:       # BB#0:
503; SSE2-NEXT:    shll $8, %esi
504; SSE2-NEXT:    movzbl %dil, %eax
505; SSE2-NEXT:    orl %esi, %eax
506; SSE2-NEXT:    pxor %xmm0, %xmm0
507; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
508; SSE2-NEXT:    movzbl %dl, %eax
509; SSE2-NEXT:    pinsrw $3, %eax, %xmm0
510; SSE2-NEXT:    movzbl %cl, %eax
511; SSE2-NEXT:    pinsrw $4, %eax, %xmm0
512; SSE2-NEXT:    shll $8, %r8d
513; SSE2-NEXT:    pinsrw $5, %r8d, %xmm0
514; SSE2-NEXT:    movzbl %r9b, %eax
515; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
516; SSE2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
517; SSE2-NEXT:    shll $8, %eax
518; SSE2-NEXT:    pinsrw $7, %eax, %xmm0
519; SSE2-NEXT:    retq
520;
521; SSE41-LABEL: test_buildvector_v16i8_register_zero_2:
522; SSE41:       # BB#0:
523; SSE41-NEXT:    pxor %xmm0, %xmm0
524; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
525; SSE41-NEXT:    pinsrb $3, %esi, %xmm0
526; SSE41-NEXT:    pinsrb $6, %edx, %xmm0
527; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
528; SSE41-NEXT:    pinsrb $11, %r8d, %xmm0
529; SSE41-NEXT:    pinsrb $12, %r9d, %xmm0
530; SSE41-NEXT:    pinsrb $15, {{[0-9]+}}(%rsp), %xmm0
531; SSE41-NEXT:    retq
532  %ins0  = insertelement <16 x i8> undef,  i8     0, i32 0
533  %ins1  = insertelement <16 x i8> %ins0,  i8     0, i32 1
534  %ins2  = insertelement <16 x i8> %ins1,  i8   %a2, i32 2
535  %ins3  = insertelement <16 x i8> %ins2,  i8   %a3, i32 3
536  %ins4  = insertelement <16 x i8> %ins3,  i8     0, i32 4
537  %ins5  = insertelement <16 x i8> %ins4,  i8     0, i32 5
538  %ins6  = insertelement <16 x i8> %ins5,  i8   %a6, i32 6
539  %ins7  = insertelement <16 x i8> %ins6,  i8     0, i32 7
540  %ins8  = insertelement <16 x i8> %ins7,  i8   %a8, i32 8
541  %ins9  = insertelement <16 x i8> %ins8,  i8     0, i32 9
542  %ins10 = insertelement <16 x i8> %ins9,  i8     0, i32 10
543  %ins11 = insertelement <16 x i8> %ins10, i8  %a11, i32 11
544  %ins12 = insertelement <16 x i8> %ins11, i8  %a12, i32 12
545  %ins13 = insertelement <16 x i8> %ins12, i8     0, i32 13
546  %ins14 = insertelement <16 x i8> %ins13, i8     0, i32 14
547  %ins15 = insertelement <16 x i8> %ins14, i8  %a15, i32 15
548  ret <16 x i8> %ins15
549}
550