1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
5
6; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428
7; This is a larger-than-usual regression test to verify that several backend
8; transforms are working together. We want to hoist the expansion of non-uniform
9; vector shifts out of a loop if we do not have real vector shift instructions.
10; See test/Transforms/CodeGenPrepare/X86/vec-shift.ll for the 1st step in that
11; sequence.
12
; Test case: an unrolled, vectorized loop that shifts each i32 of %arr left by
; either %amt0 or %amt1, chosen per element by whether the matching %control
; byte is zero. The vector shift amount is a select between two splats, so it
; differs from lane to lane.
;
; What the autogenerated checks below show:
;  - SSE4.1 run: the two amounts are prepared once in %vector.ph (movd +
;    pmovzxdq); inside %vector.body every data vector is shifted by both
;    amounts with pslld and the two results are merged with blendvps.
;  - AVX1 run: same vpslld + vblendvps pattern, but some of the amounts are
;    spilled in %vector.ph and re-extended by folded vpmovzxdq reloads inside
;    the loop.
;  - AVX2 run: the amounts are broadcast in %vector.ph; inside %vector.body
;    vblendvps picks the per-lane amount and vpsllvd performs the shift.
13define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1) nounwind {
14; SSE-LABEL: vector_variable_shift_left_loop:
15; SSE:       # %bb.0: # %entry
16; SSE-NEXT:    testl %edx, %edx
17; SSE-NEXT:    jle .LBB0_9
18; SSE-NEXT:  # %bb.1: # %for.body.preheader
19; SSE-NEXT:    movl %ecx, %r9d
20; SSE-NEXT:    movl %edx, %eax
21; SSE-NEXT:    cmpl $31, %edx
22; SSE-NEXT:    ja .LBB0_3
23; SSE-NEXT:  # %bb.2:
24; SSE-NEXT:    xorl %edx, %edx
25; SSE-NEXT:    jmp .LBB0_6
26; SSE-NEXT:  .LBB0_3: # %vector.ph
27; SSE-NEXT:    movl %eax, %edx
28; SSE-NEXT:    andl $-32, %edx
29; SSE-NEXT:    movd %r9d, %xmm0
30; SSE-NEXT:    movd %r8d, %xmm1
31; SSE-NEXT:    xorl %ecx, %ecx
32; SSE-NEXT:    pmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
33; SSE-NEXT:    pmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
34; SSE-NEXT:    .p2align 4, 0x90
35; SSE-NEXT:  .LBB0_4: # %vector.body
36; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
37; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
38; SSE-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
39; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
40; SSE-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
41; SSE-NEXT:    pxor %xmm1, %xmm1
42; SSE-NEXT:    pcmpeqb %xmm1, %xmm0
43; SSE-NEXT:    pmovsxbd %xmm0, %xmm7
44; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
45; SSE-NEXT:    pmovsxbd %xmm0, %xmm0
46; SSE-NEXT:    pcmpeqb %xmm1, %xmm3
47; SSE-NEXT:    pmovsxbd %xmm3, %xmm13
48; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
49; SSE-NEXT:    pmovsxbd %xmm3, %xmm6
50; SSE-NEXT:    pcmpeqb %xmm1, %xmm4
51; SSE-NEXT:    pmovsxbd %xmm4, %xmm11
52; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
53; SSE-NEXT:    pmovsxbd %xmm3, %xmm2
54; SSE-NEXT:    pcmpeqb %xmm1, %xmm5
55; SSE-NEXT:    pmovsxbd %xmm5, %xmm8
56; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,2,3]
57; SSE-NEXT:    pmovsxbd %xmm3, %xmm9
58; SSE-NEXT:    movdqu 16(%rdi,%rcx,4), %xmm3
59; SSE-NEXT:    movdqa %xmm3, %xmm4
60; SSE-NEXT:    pslld %xmm15, %xmm4
61; SSE-NEXT:    pslld %xmm14, %xmm3
62; SSE-NEXT:    blendvps %xmm0, %xmm4, %xmm3
63; SSE-NEXT:    movdqu (%rdi,%rcx,4), %xmm10
64; SSE-NEXT:    movdqa %xmm10, %xmm5
65; SSE-NEXT:    pslld %xmm15, %xmm5
66; SSE-NEXT:    pslld %xmm14, %xmm10
67; SSE-NEXT:    movdqa %xmm7, %xmm0
68; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm10
69; SSE-NEXT:    movdqu 48(%rdi,%rcx,4), %xmm12
70; SSE-NEXT:    movdqa %xmm12, %xmm5
71; SSE-NEXT:    pslld %xmm15, %xmm5
72; SSE-NEXT:    pslld %xmm14, %xmm12
73; SSE-NEXT:    movdqa %xmm6, %xmm0
74; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm12
75; SSE-NEXT:    movdqu 32(%rdi,%rcx,4), %xmm6
76; SSE-NEXT:    movdqa %xmm6, %xmm5
77; SSE-NEXT:    pslld %xmm15, %xmm5
78; SSE-NEXT:    pslld %xmm14, %xmm6
79; SSE-NEXT:    movdqa %xmm13, %xmm0
80; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm6
81; SSE-NEXT:    movdqu 80(%rdi,%rcx,4), %xmm1
82; SSE-NEXT:    movdqa %xmm1, %xmm5
83; SSE-NEXT:    pslld %xmm15, %xmm5
84; SSE-NEXT:    pslld %xmm14, %xmm1
85; SSE-NEXT:    movdqa %xmm2, %xmm0
86; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm1
87; SSE-NEXT:    movdqu 64(%rdi,%rcx,4), %xmm5
88; SSE-NEXT:    movdqa %xmm5, %xmm2
89; SSE-NEXT:    pslld %xmm15, %xmm2
90; SSE-NEXT:    pslld %xmm14, %xmm5
91; SSE-NEXT:    movdqa %xmm11, %xmm0
92; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm5
93; SSE-NEXT:    movdqu 112(%rdi,%rcx,4), %xmm2
94; SSE-NEXT:    movdqa %xmm2, %xmm4
95; SSE-NEXT:    pslld %xmm15, %xmm4
96; SSE-NEXT:    pslld %xmm14, %xmm2
97; SSE-NEXT:    movdqa %xmm9, %xmm0
98; SSE-NEXT:    blendvps %xmm0, %xmm4, %xmm2
99; SSE-NEXT:    movdqu 96(%rdi,%rcx,4), %xmm4
100; SSE-NEXT:    movdqa %xmm4, %xmm7
101; SSE-NEXT:    pslld %xmm15, %xmm7
102; SSE-NEXT:    pslld %xmm14, %xmm4
103; SSE-NEXT:    movdqa %xmm8, %xmm0
104; SSE-NEXT:    blendvps %xmm0, %xmm7, %xmm4
105; SSE-NEXT:    movups %xmm10, (%rdi,%rcx,4)
106; SSE-NEXT:    movups %xmm3, 16(%rdi,%rcx,4)
107; SSE-NEXT:    movups %xmm6, 32(%rdi,%rcx,4)
108; SSE-NEXT:    movups %xmm12, 48(%rdi,%rcx,4)
109; SSE-NEXT:    movups %xmm5, 64(%rdi,%rcx,4)
110; SSE-NEXT:    movups %xmm1, 80(%rdi,%rcx,4)
111; SSE-NEXT:    movups %xmm4, 96(%rdi,%rcx,4)
112; SSE-NEXT:    movups %xmm2, 112(%rdi,%rcx,4)
113; SSE-NEXT:    addq $32, %rcx
114; SSE-NEXT:    cmpq %rcx, %rdx
115; SSE-NEXT:    jne .LBB0_4
116; SSE-NEXT:  # %bb.5: # %middle.block
117; SSE-NEXT:    cmpq %rax, %rdx
118; SSE-NEXT:    jne .LBB0_6
119; SSE-NEXT:  .LBB0_9: # %for.cond.cleanup
120; SSE-NEXT:    retq
121; SSE-NEXT:    .p2align 4, 0x90
122; SSE-NEXT:  .LBB0_8: # %for.body
123; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
124; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
125; SSE-NEXT:    shll %cl, (%rdi,%rdx,4)
126; SSE-NEXT:    incq %rdx
127; SSE-NEXT:    cmpq %rdx, %rax
128; SSE-NEXT:    je .LBB0_9
129; SSE-NEXT:  .LBB0_6: # %for.body
130; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
131; SSE-NEXT:    cmpb $0, (%rsi,%rdx)
132; SSE-NEXT:    movl %r9d, %ecx
133; SSE-NEXT:    je .LBB0_8
134; SSE-NEXT:  # %bb.7: # %for.body
135; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
136; SSE-NEXT:    movl %r8d, %ecx
137; SSE-NEXT:    jmp .LBB0_8
138;
139; AVX1-LABEL: vector_variable_shift_left_loop:
140; AVX1:       # %bb.0: # %entry
141; AVX1-NEXT:    subq $24, %rsp
142; AVX1-NEXT:    testl %edx, %edx
143; AVX1-NEXT:    jle .LBB0_9
144; AVX1-NEXT:  # %bb.1: # %for.body.preheader
145; AVX1-NEXT:    movl %ecx, %r9d
146; AVX1-NEXT:    movl %edx, %eax
147; AVX1-NEXT:    cmpl $31, %edx
148; AVX1-NEXT:    ja .LBB0_3
149; AVX1-NEXT:  # %bb.2:
150; AVX1-NEXT:    xorl %edx, %edx
151; AVX1-NEXT:    jmp .LBB0_6
152; AVX1-NEXT:  .LBB0_3: # %vector.ph
153; AVX1-NEXT:    movl %eax, %edx
154; AVX1-NEXT:    andl $-32, %edx
155; AVX1-NEXT:    vmovd %r9d, %xmm0
156; AVX1-NEXT:    vmovd %r8d, %xmm1
157; AVX1-NEXT:    xorl %ecx, %ecx
158; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
159; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
160; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
161; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
162; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
163; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
164; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero
165; AVX1-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
166; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
167; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
168; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
169; AVX1-NEXT:    vpxor %xmm11, %xmm11, %xmm11
170; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
171; AVX1-NEXT:    .p2align 4, 0x90
172; AVX1-NEXT:  .LBB0_4: # %vector.body
173; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
174; AVX1-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
175; AVX1-NEXT:    # xmm1 = mem[0],zero,mem[1],zero
176; AVX1-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
177; AVX1-NEXT:    # xmm2 = mem[0],zero,mem[1],zero
178; AVX1-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
179; AVX1-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
180; AVX1-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
181; AVX1-NEXT:    vmovq {{.*#+}} xmm6 = mem[0],zero
182; AVX1-NEXT:    vpcmpeqb %xmm11, %xmm3, %xmm3
183; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm7
184; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
185; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
186; AVX1-NEXT:    vpcmpeqb %xmm11, %xmm4, %xmm4
187; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm8
188; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
189; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
190; AVX1-NEXT:    vpcmpeqb %xmm11, %xmm5, %xmm5
191; AVX1-NEXT:    vmovdqu (%rdi,%rcx,4), %xmm9
192; AVX1-NEXT:    vpslld %xmm2, %xmm9, %xmm10
193; AVX1-NEXT:    vpslld %xmm1, %xmm9, %xmm0
194; AVX1-NEXT:    vblendvps %xmm7, %xmm10, %xmm0, %xmm9
195; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm7
196; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
197; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm5
198; AVX1-NEXT:    vpcmpeqb %xmm11, %xmm6, %xmm6
199; AVX1-NEXT:    vmovdqu 16(%rdi,%rcx,4), %xmm0
200; AVX1-NEXT:    vpslld %xmm2, %xmm0, %xmm2
201; AVX1-NEXT:    vpslld %xmm1, %xmm0, %xmm0
202; AVX1-NEXT:    vpmovsxbd %xmm6, %xmm1
203; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
204; AVX1-NEXT:    vpmovsxbd %xmm6, %xmm6
205; AVX1-NEXT:    vblendvps %xmm3, %xmm2, %xmm0, %xmm10
206; AVX1-NEXT:    vmovdqu 32(%rdi,%rcx,4), %xmm2
207; AVX1-NEXT:    vpslld %xmm15, %xmm2, %xmm3
208; AVX1-NEXT:    vpslld %xmm14, %xmm2, %xmm2
209; AVX1-NEXT:    vblendvps %xmm8, %xmm3, %xmm2, %xmm8
210; AVX1-NEXT:    vmovdqu 48(%rdi,%rcx,4), %xmm3
211; AVX1-NEXT:    vpslld %xmm15, %xmm3, %xmm0
212; AVX1-NEXT:    vpslld %xmm14, %xmm3, %xmm3
213; AVX1-NEXT:    vblendvps %xmm4, %xmm0, %xmm3, %xmm0
214; AVX1-NEXT:    vmovdqu 64(%rdi,%rcx,4), %xmm3
215; AVX1-NEXT:    vpslld %xmm13, %xmm3, %xmm4
216; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
217; AVX1-NEXT:    vpslld %xmm2, %xmm3, %xmm3
218; AVX1-NEXT:    vblendvps %xmm7, %xmm4, %xmm3, %xmm3
219; AVX1-NEXT:    vmovdqu 80(%rdi,%rcx,4), %xmm4
220; AVX1-NEXT:    vpslld %xmm13, %xmm4, %xmm7
221; AVX1-NEXT:    vpslld %xmm2, %xmm4, %xmm4
222; AVX1-NEXT:    vblendvps %xmm5, %xmm7, %xmm4, %xmm4
223; AVX1-NEXT:    vmovdqu 96(%rdi,%rcx,4), %xmm5
224; AVX1-NEXT:    vpslld %xmm12, %xmm5, %xmm7
225; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
226; AVX1-NEXT:    vpslld %xmm2, %xmm5, %xmm5
227; AVX1-NEXT:    vblendvps %xmm1, %xmm7, %xmm5, %xmm1
228; AVX1-NEXT:    vmovdqu 112(%rdi,%rcx,4), %xmm5
229; AVX1-NEXT:    vpslld %xmm12, %xmm5, %xmm7
230; AVX1-NEXT:    vpslld %xmm2, %xmm5, %xmm5
231; AVX1-NEXT:    vblendvps %xmm6, %xmm7, %xmm5, %xmm5
232; AVX1-NEXT:    vmovups %xmm9, (%rdi,%rcx,4)
233; AVX1-NEXT:    vmovups %xmm10, 16(%rdi,%rcx,4)
234; AVX1-NEXT:    vmovups %xmm8, 32(%rdi,%rcx,4)
235; AVX1-NEXT:    vmovups %xmm0, 48(%rdi,%rcx,4)
236; AVX1-NEXT:    vmovups %xmm3, 64(%rdi,%rcx,4)
237; AVX1-NEXT:    vmovups %xmm4, 80(%rdi,%rcx,4)
238; AVX1-NEXT:    vmovups %xmm1, 96(%rdi,%rcx,4)
239; AVX1-NEXT:    vmovups %xmm5, 112(%rdi,%rcx,4)
240; AVX1-NEXT:    addq $32, %rcx
241; AVX1-NEXT:    cmpq %rcx, %rdx
242; AVX1-NEXT:    jne .LBB0_4
243; AVX1-NEXT:  # %bb.5: # %middle.block
244; AVX1-NEXT:    cmpq %rax, %rdx
245; AVX1-NEXT:    jne .LBB0_6
246; AVX1-NEXT:  .LBB0_9: # %for.cond.cleanup
247; AVX1-NEXT:    addq $24, %rsp
248; AVX1-NEXT:    vzeroupper
249; AVX1-NEXT:    retq
250; AVX1-NEXT:    .p2align 4, 0x90
251; AVX1-NEXT:  .LBB0_8: # %for.body
252; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
253; AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
254; AVX1-NEXT:    shll %cl, (%rdi,%rdx,4)
255; AVX1-NEXT:    incq %rdx
256; AVX1-NEXT:    cmpq %rdx, %rax
257; AVX1-NEXT:    je .LBB0_9
258; AVX1-NEXT:  .LBB0_6: # %for.body
259; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
260; AVX1-NEXT:    cmpb $0, (%rsi,%rdx)
261; AVX1-NEXT:    movl %r9d, %ecx
262; AVX1-NEXT:    je .LBB0_8
263; AVX1-NEXT:  # %bb.7: # %for.body
264; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
265; AVX1-NEXT:    movl %r8d, %ecx
266; AVX1-NEXT:    jmp .LBB0_8
267;
268; AVX2-LABEL: vector_variable_shift_left_loop:
269; AVX2:       # %bb.0: # %entry
270; AVX2-NEXT:    testl %edx, %edx
271; AVX2-NEXT:    jle .LBB0_9
272; AVX2-NEXT:  # %bb.1: # %for.body.preheader
273; AVX2-NEXT:    movl %ecx, %r9d
274; AVX2-NEXT:    movl %edx, %eax
275; AVX2-NEXT:    cmpl $31, %edx
276; AVX2-NEXT:    ja .LBB0_3
277; AVX2-NEXT:  # %bb.2:
278; AVX2-NEXT:    xorl %edx, %edx
279; AVX2-NEXT:    jmp .LBB0_6
280; AVX2-NEXT:  .LBB0_3: # %vector.ph
281; AVX2-NEXT:    movl %eax, %edx
282; AVX2-NEXT:    andl $-32, %edx
283; AVX2-NEXT:    vmovd %r9d, %xmm0
284; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
285; AVX2-NEXT:    vmovd %r8d, %xmm1
286; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
287; AVX2-NEXT:    xorl %ecx, %ecx
288; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
289; AVX2-NEXT:    .p2align 4, 0x90
290; AVX2-NEXT:  .LBB0_4: # %vector.body
291; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
292; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
293; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
294; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
295; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
296; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm3, %ymm3
297; AVX2-NEXT:    vblendvps %ymm3, %ymm0, %ymm1, %ymm3
298; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm4, %ymm4
299; AVX2-NEXT:    vblendvps %ymm4, %ymm0, %ymm1, %ymm4
300; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm5, %ymm5
301; AVX2-NEXT:    vblendvps %ymm5, %ymm0, %ymm1, %ymm5
302; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm6, %ymm6
303; AVX2-NEXT:    vblendvps %ymm6, %ymm0, %ymm1, %ymm6
304; AVX2-NEXT:    vmovdqu (%rdi,%rcx,4), %ymm7
305; AVX2-NEXT:    vpsllvd %ymm3, %ymm7, %ymm3
306; AVX2-NEXT:    vmovdqu 32(%rdi,%rcx,4), %ymm7
307; AVX2-NEXT:    vpsllvd %ymm4, %ymm7, %ymm4
308; AVX2-NEXT:    vmovdqu 64(%rdi,%rcx,4), %ymm7
309; AVX2-NEXT:    vpsllvd %ymm5, %ymm7, %ymm5
310; AVX2-NEXT:    vmovdqu 96(%rdi,%rcx,4), %ymm7
311; AVX2-NEXT:    vpsllvd %ymm6, %ymm7, %ymm6
312; AVX2-NEXT:    vmovdqu %ymm3, (%rdi,%rcx,4)
313; AVX2-NEXT:    vmovdqu %ymm4, 32(%rdi,%rcx,4)
314; AVX2-NEXT:    vmovdqu %ymm5, 64(%rdi,%rcx,4)
315; AVX2-NEXT:    vmovdqu %ymm6, 96(%rdi,%rcx,4)
316; AVX2-NEXT:    addq $32, %rcx
317; AVX2-NEXT:    cmpq %rcx, %rdx
318; AVX2-NEXT:    jne .LBB0_4
319; AVX2-NEXT:  # %bb.5: # %middle.block
320; AVX2-NEXT:    cmpq %rax, %rdx
321; AVX2-NEXT:    jne .LBB0_6
322; AVX2-NEXT:  .LBB0_9: # %for.cond.cleanup
323; AVX2-NEXT:    vzeroupper
324; AVX2-NEXT:    retq
325; AVX2-NEXT:    .p2align 4, 0x90
326; AVX2-NEXT:  .LBB0_8: # %for.body
327; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
328; AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
329; AVX2-NEXT:    shll %cl, (%rdi,%rdx,4)
330; AVX2-NEXT:    incq %rdx
331; AVX2-NEXT:    cmpq %rdx, %rax
332; AVX2-NEXT:    je .LBB0_9
333; AVX2-NEXT:  .LBB0_6: # %for.body
334; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
335; AVX2-NEXT:    cmpb $0, (%rsi,%rdx)
336; AVX2-NEXT:    movl %r9d, %ecx
337; AVX2-NEXT:    je .LBB0_8
338; AVX2-NEXT:  # %bb.7: # %for.body
339; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
340; AVX2-NEXT:    movl %r8d, %ecx
341; AVX2-NEXT:    jmp .LBB0_8
; Nothing to do unless %count is positive (signed compare).
342entry:
343  %cmp12 = icmp sgt i32 %count, 0
344  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup
345
; With fewer than 32 elements, bypass the vector loop and go to the scalar one.
346for.body.preheader:
347  %wide.trip.count = zext i32 %count to i64
348  %min.iters.check = icmp ult i32 %count, 32
349  br i1 %min.iters.check, label %for.body.preheader40, label %vector.ph
350
; Scalar-loop entry: the start index is 0 when the vector loop was bypassed,
; or %n.vec when arriving from %middle.block.
351for.body.preheader40:
352  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
353  br label %for.body
354
; %n.vec is the trip count with its low five bits cleared
; (4294967264 = 0xFFFFFFE0), i.e. rounded down to a multiple of 32.
; Each of the four unrolled parts gets its own <8 x i32> splat of %amt0 and
; of %amt1 (eight splats in total, all loop-invariant).
355vector.ph:
356  %n.vec = and i64 %wide.trip.count, 4294967264
357  %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %amt0, i32 0
358  %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
359  %broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %amt1, i32 0
360  %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer
361  %broadcast.splatinsert24 = insertelement <8 x i32> undef, i32 %amt0, i32 0
362  %broadcast.splat25 = shufflevector <8 x i32> %broadcast.splatinsert24, <8 x i32> undef, <8 x i32> zeroinitializer
363  %broadcast.splatinsert26 = insertelement <8 x i32> undef, i32 %amt1, i32 0
364  %broadcast.splat27 = shufflevector <8 x i32> %broadcast.splatinsert26, <8 x i32> undef, <8 x i32> zeroinitializer
365  %broadcast.splatinsert28 = insertelement <8 x i32> undef, i32 %amt0, i32 0
366  %broadcast.splat29 = shufflevector <8 x i32> %broadcast.splatinsert28, <8 x i32> undef, <8 x i32> zeroinitializer
367  %broadcast.splatinsert30 = insertelement <8 x i32> undef, i32 %amt1, i32 0
368  %broadcast.splat31 = shufflevector <8 x i32> %broadcast.splatinsert30, <8 x i32> undef, <8 x i32> zeroinitializer
369  %broadcast.splatinsert32 = insertelement <8 x i32> undef, i32 %amt0, i32 0
370  %broadcast.splat33 = shufflevector <8 x i32> %broadcast.splatinsert32, <8 x i32> undef, <8 x i32> zeroinitializer
371  %broadcast.splatinsert34 = insertelement <8 x i32> undef, i32 %amt1, i32 0
372  %broadcast.splat35 = shufflevector <8 x i32> %broadcast.splatinsert34, <8 x i32> undef, <8 x i32> zeroinitializer
373  br label %vector.body
374
; Each iteration handles 32 elements as four 8-lane parts:
;   1. load 8 control bytes per part (offsets 0, 8, 16, 24 from %control+%index);
;   2. compare them against zero;
;   3. select the %amt0 splat for zero bytes and the %amt1 splat otherwise;
;   4. load the matching 8 i32s of %arr, shift them left by the selected
;      amounts, and store them back to the same addresses.
; The loop ends when the advanced index equals %n.vec.
375vector.body:
376  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
377  %0 = getelementptr inbounds i8, i8* %control, i64 %index
378  %1 = bitcast i8* %0 to <8 x i8>*
379  %wide.load = load <8 x i8>, <8 x i8>* %1, align 1
380  %2 = getelementptr inbounds i8, i8* %0, i64 8
381  %3 = bitcast i8* %2 to <8 x i8>*
382  %wide.load17 = load <8 x i8>, <8 x i8>* %3, align 1
383  %4 = getelementptr inbounds i8, i8* %0, i64 16
384  %5 = bitcast i8* %4 to <8 x i8>*
385  %wide.load18 = load <8 x i8>, <8 x i8>* %5, align 1
386  %6 = getelementptr inbounds i8, i8* %0, i64 24
387  %7 = bitcast i8* %6 to <8 x i8>*
388  %wide.load19 = load <8 x i8>, <8 x i8>* %7, align 1
389  %8 = icmp eq <8 x i8> %wide.load, zeroinitializer
390  %9 = icmp eq <8 x i8> %wide.load17, zeroinitializer
391  %10 = icmp eq <8 x i8> %wide.load18, zeroinitializer
392  %11 = icmp eq <8 x i8> %wide.load19, zeroinitializer
393  %12 = select <8 x i1> %8, <8 x i32> %broadcast.splat21, <8 x i32> %broadcast.splat23
394  %13 = select <8 x i1> %9, <8 x i32> %broadcast.splat25, <8 x i32> %broadcast.splat27
395  %14 = select <8 x i1> %10, <8 x i32> %broadcast.splat29, <8 x i32> %broadcast.splat31
396  %15 = select <8 x i1> %11, <8 x i32> %broadcast.splat33, <8 x i32> %broadcast.splat35
397  %16 = getelementptr inbounds i32, i32* %arr, i64 %index
398  %17 = bitcast i32* %16 to <8 x i32>*
399  %wide.load36 = load <8 x i32>, <8 x i32>* %17, align 4
400  %18 = getelementptr inbounds i32, i32* %16, i64 8
401  %19 = bitcast i32* %18 to <8 x i32>*
402  %wide.load37 = load <8 x i32>, <8 x i32>* %19, align 4
403  %20 = getelementptr inbounds i32, i32* %16, i64 16
404  %21 = bitcast i32* %20 to <8 x i32>*
405  %wide.load38 = load <8 x i32>, <8 x i32>* %21, align 4
406  %22 = getelementptr inbounds i32, i32* %16, i64 24
407  %23 = bitcast i32* %22 to <8 x i32>*
408  %wide.load39 = load <8 x i32>, <8 x i32>* %23, align 4
409  %24 = shl <8 x i32> %wide.load36, %12
410  %25 = shl <8 x i32> %wide.load37, %13
411  %26 = shl <8 x i32> %wide.load38, %14
412  %27 = shl <8 x i32> %wide.load39, %15
413  %28 = bitcast i32* %16 to <8 x i32>*
414  store <8 x i32> %24, <8 x i32>* %28, align 4
415  %29 = bitcast i32* %18 to <8 x i32>*
416  store <8 x i32> %25, <8 x i32>* %29, align 4
417  %30 = bitcast i32* %20 to <8 x i32>*
418  store <8 x i32> %26, <8 x i32>* %30, align 4
419  %31 = bitcast i32* %22 to <8 x i32>*
420  store <8 x i32> %27, <8 x i32>* %31, align 4
421  %index.next = add i64 %index, 32
422  %32 = icmp eq i64 %index.next, %n.vec
423  br i1 %32, label %middle.block, label %vector.body
424
; Finished if the vector loop covered the whole trip count; otherwise the
; scalar loop handles the remaining elements.
425middle.block:
426  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
427  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader40
428
429for.cond.cleanup:
430  ret void
431
; Scalar loop: one element per iteration, using the same rule as the vector
; loop (shift by %amt0 when the control byte is zero, by %amt1 otherwise).
432for.body:
433  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader40 ]
434  %arrayidx = getelementptr inbounds i8, i8* %control, i64 %indvars.iv
435  %33 = load i8, i8* %arrayidx, align 1
436  %tobool = icmp eq i8 %33, 0
437  %cond = select i1 %tobool, i32 %amt0, i32 %amt1
438  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
439  %34 = load i32, i32* %arrayidx2, align 4
440  %shl = shl i32 %34, %cond
441  store i32 %shl, i32* %arrayidx2, align 4
442  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
443  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
444  br i1 %exitcond, label %for.cond.cleanup, label %for.body
445}
446
; Reduced variant of the test above: a single <4 x i32> part per iteration, no
; scalar remainder loop, and the value being shifted is a loop-invariant splat
; of %x instead of data loaded from %arr (this loop only stores to %arr).
;
; What the autogenerated checks below show:
;  - SSE4.1 and AVX1 runs: both candidate results are computed before the loop
;    in %vector.ph (pslld $23 + paddd + cvttps2dq + pmulld, then a pshufd
;    splat), so %vector.body contains only the compare, a blendvps and the
;    store.
;  - AVX2 run: the three inputs are broadcast in %vector.ph; inside
;    %vector.body vblendvps selects the per-lane amount and vpsllvd shifts.
447define void @vector_variable_shift_left_loop_simpler(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) nounwind {
448; SSE-LABEL: vector_variable_shift_left_loop_simpler:
449; SSE:       # %bb.0: # %entry
450; SSE-NEXT:    testl %edx, %edx
451; SSE-NEXT:    jle .LBB1_3
452; SSE-NEXT:  # %bb.1: # %vector.ph
453; SSE-NEXT:    movl %edx, %eax
454; SSE-NEXT:    andl $-4, %eax
455; SSE-NEXT:    movd %ecx, %xmm0
456; SSE-NEXT:    movd %r8d, %xmm2
457; SSE-NEXT:    movd %r9d, %xmm3
458; SSE-NEXT:    xorl %ecx, %ecx
459; SSE-NEXT:    pslld $23, %xmm0
460; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
461; SSE-NEXT:    paddd %xmm4, %xmm0
462; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
463; SSE-NEXT:    pmulld %xmm3, %xmm0
464; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
465; SSE-NEXT:    pslld $23, %xmm2
466; SSE-NEXT:    paddd %xmm4, %xmm2
467; SSE-NEXT:    cvttps2dq %xmm2, %xmm0
468; SSE-NEXT:    pmulld %xmm3, %xmm0
469; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
470; SSE-NEXT:    pxor %xmm3, %xmm3
471; SSE-NEXT:    .p2align 4, 0x90
472; SSE-NEXT:  .LBB1_2: # %vector.body
473; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
474; SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
475; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
476; SSE-NEXT:    movdqa %xmm2, %xmm4
477; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm4
478; SSE-NEXT:    movups %xmm4, (%rdi,%rcx,4)
479; SSE-NEXT:    addq $4, %rcx
480; SSE-NEXT:    cmpq %rcx, %rax
481; SSE-NEXT:    jne .LBB1_2
482; SSE-NEXT:  .LBB1_3: # %exit
483; SSE-NEXT:    retq
484;
485; AVX1-LABEL: vector_variable_shift_left_loop_simpler:
486; AVX1:       # %bb.0: # %entry
487; AVX1-NEXT:    testl %edx, %edx
488; AVX1-NEXT:    jle .LBB1_3
489; AVX1-NEXT:  # %bb.1: # %vector.ph
490; AVX1-NEXT:    movl %edx, %eax
491; AVX1-NEXT:    andl $-4, %eax
492; AVX1-NEXT:    vmovd %ecx, %xmm0
493; AVX1-NEXT:    vmovd %r8d, %xmm1
494; AVX1-NEXT:    vmovd %r9d, %xmm2
495; AVX1-NEXT:    xorl %ecx, %ecx
496; AVX1-NEXT:    vpslld $23, %xmm0, %xmm0
497; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
498; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
499; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
500; AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
501; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
502; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
503; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
504; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
505; AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
506; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
507; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
508; AVX1-NEXT:    .p2align 4, 0x90
509; AVX1-NEXT:  .LBB1_2: # %vector.body
510; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
511; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
512; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm3
513; AVX1-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm3
514; AVX1-NEXT:    vmovups %xmm3, (%rdi,%rcx,4)
515; AVX1-NEXT:    addq $4, %rcx
516; AVX1-NEXT:    cmpq %rcx, %rax
517; AVX1-NEXT:    jne .LBB1_2
518; AVX1-NEXT:  .LBB1_3: # %exit
519; AVX1-NEXT:    retq
520;
521; AVX2-LABEL: vector_variable_shift_left_loop_simpler:
522; AVX2:       # %bb.0: # %entry
523; AVX2-NEXT:    testl %edx, %edx
524; AVX2-NEXT:    jle .LBB1_3
525; AVX2-NEXT:  # %bb.1: # %vector.ph
526; AVX2-NEXT:    movl %edx, %eax
527; AVX2-NEXT:    andl $-4, %eax
528; AVX2-NEXT:    vmovd %ecx, %xmm0
529; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
530; AVX2-NEXT:    vmovd %r8d, %xmm1
531; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
532; AVX2-NEXT:    vmovd %r9d, %xmm2
533; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
534; AVX2-NEXT:    xorl %ecx, %ecx
535; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
536; AVX2-NEXT:    .p2align 4, 0x90
537; AVX2-NEXT:  .LBB1_2: # %vector.body
538; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
539; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
540; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm4
541; AVX2-NEXT:    vblendvps %xmm4, %xmm0, %xmm1, %xmm4
542; AVX2-NEXT:    vpsllvd %xmm4, %xmm2, %xmm4
543; AVX2-NEXT:    vmovdqu %xmm4, (%rdi,%rcx,4)
544; AVX2-NEXT:    addq $4, %rcx
545; AVX2-NEXT:    cmpq %rcx, %rax
546; AVX2-NEXT:    jne .LBB1_2
547; AVX2-NEXT:  .LBB1_3: # %exit
548; AVX2-NEXT:    retq
; Enter the vector loop only when %count is positive (signed compare); the
; zero-extended trip count is computed here but first used in %vector.ph.
549entry:
550  %cmp16 = icmp sgt i32 %count, 0
551  %wide.trip.count = zext i32 %count to i64
552  br i1 %cmp16, label %vector.ph, label %exit
553
; %n.vec is the trip count with its low two bits cleared
; (4294967292 = 0xFFFFFFFC). %splat1, %splat2 and %splat3 are <4 x i32>
; splats of %amt0, %amt1 and %x respectively, all loop-invariant.
554vector.ph:
555  %n.vec = and i64 %wide.trip.count, 4294967292
556  %splatinsert18 = insertelement <4 x i32> undef, i32 %amt0, i32 0
557  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
558  %splatinsert20 = insertelement <4 x i32> undef, i32 %amt1, i32 0
559  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
560  %splatinsert22 = insertelement <4 x i32> undef, i32 %x, i32 0
561  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
562  br label %vector.body
563
; Per iteration: load 4 control bytes, select the %amt0 splat for zero bytes
; and the %amt1 splat otherwise, shift the %x splat left by the selected
; amounts, and store the 4 results to %arr at %index. The loop ends when the
; advanced index equals %n.vec.
564vector.body:
565  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
566  %0 = getelementptr inbounds i8, i8* %control, i64 %index
567  %1 = bitcast i8* %0 to <4 x i8>*
568  %wide.load = load <4 x i8>, <4 x i8>* %1, align 1
569  %2 = icmp eq <4 x i8> %wide.load, zeroinitializer
570  %3 = select <4 x i1> %2, <4 x i32> %splat1, <4 x i32> %splat2
571  %4 = shl <4 x i32> %splat3, %3
572  %5 = getelementptr inbounds i32, i32* %arr, i64 %index
573  %6 = bitcast i32* %5 to <4 x i32>*
574  store <4 x i32> %4, <4 x i32>* %6, align 4
575  %index.next = add i64 %index, 4
576  %7 = icmp eq i64 %index.next, %n.vec
577  br i1 %7, label %exit, label %vector.body
578
579exit:
580  ret void
581}
582