; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2

; PR37428 - https://bugs.llvm.org/show_bug.cgi?id=37428
; This is a larger-than-usual regression test to verify that several backend
; transforms are working together. We want to hoist the expansion of non-uniform
; vector shifts out of a loop if we do not have real vector shift instructions.
; See test/Transforms/CodeGenPrepare/X86/vec-shift.ll for the 1st step in that
; sequence.

; Scalar form of the loop in this function:
;   for (i = 0; i < count; ++i)
;     arr[i] <<= (control[i] == 0) ? amt0 : amt1;
; The IR is already vectorized: %vector.body handles 32 elements per iteration
; as four <8 x i32> shifts, and %for.body is the scalar loop that runs when
; count is below 32 or when elements are left over after the vector loop.
; What the assertions pin down: with AVX2 the per-lane selected amounts feed
; vpsllvd directly. The SSE4.1 and AVX1 output has no per-lane variable shift,
; so each data vector is shifted by both uniform amounts and blendvps picks the
; result per lane; the zero-extended shift amounts are set up in %vector.ph.

define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1) nounwind {
; SSE-LABEL: vector_variable_shift_left_loop:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    testl %edx, %edx
; SSE-NEXT:    jle .LBB0_9
; SSE-NEXT:  # %bb.1: # %for.body.preheader
; SSE-NEXT:    movl %ecx, %r9d
; SSE-NEXT:    movl %edx, %eax
; SSE-NEXT:    cmpl $31, %edx
; SSE-NEXT:    ja .LBB0_3
; SSE-NEXT:  # %bb.2:
; SSE-NEXT:    xorl %edx, %edx
; SSE-NEXT:    jmp .LBB0_6
; SSE-NEXT:  .LBB0_3: # %vector.ph
; SSE-NEXT:    movl %eax, %edx
; SSE-NEXT:    andl $-32, %edx
; SSE-NEXT:    movd %r9d, %xmm0
; SSE-NEXT:    movd %r8d, %xmm1
; SSE-NEXT:    xorl %ecx, %ecx
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB0_4: # %vector.body
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm3 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm4 = mem[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm5 = mem[0],zero
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    pcmpeqb %xmm1, %xmm0
; SSE-NEXT:    pmovsxbd %xmm0, %xmm7
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE-NEXT:    pcmpeqb %xmm1, %xmm3
; SSE-NEXT:    pmovsxbd %xmm3, %xmm13
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE-NEXT:    pmovsxbd %xmm3, %xmm6
; SSE-NEXT:    pcmpeqb %xmm1, %xmm4
; SSE-NEXT:    pmovsxbd %xmm4, %xmm11
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[1,1,2,3]
; SSE-NEXT:    pmovsxbd %xmm3, %xmm2
; SSE-NEXT:    pcmpeqb %xmm1, %xmm5
; SSE-NEXT:    pmovsxbd %xmm5, %xmm8
; SSE-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,2,3]
; SSE-NEXT:    pmovsxbd %xmm3, %xmm9
; SSE-NEXT:    movdqu 16(%rdi,%rcx,4), %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm4
; SSE-NEXT:    pslld %xmm15, %xmm4
; SSE-NEXT:    pslld %xmm14, %xmm3
; SSE-NEXT:    blendvps %xmm0, %xmm4, %xmm3
; SSE-NEXT:    movdqu (%rdi,%rcx,4), %xmm10
; SSE-NEXT:    movdqa %xmm10, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm10
; SSE-NEXT:    movdqa %xmm7, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm10
; SSE-NEXT:    movdqu 48(%rdi,%rcx,4), %xmm12
; SSE-NEXT:    movdqa %xmm12, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm12
; SSE-NEXT:    movdqa %xmm6, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm12
; SSE-NEXT:    movdqu 32(%rdi,%rcx,4), %xmm6
; SSE-NEXT:    movdqa %xmm6, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm6
; SSE-NEXT:    movdqa %xmm13, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm6
; SSE-NEXT:    movdqu 80(%rdi,%rcx,4), %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm5
; SSE-NEXT:    pslld %xmm15, %xmm5
; SSE-NEXT:    pslld %xmm14, %xmm1
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm5, %xmm1
; SSE-NEXT:    movdqu 64(%rdi,%rcx,4), %xmm5
; SSE-NEXT:    movdqa %xmm5, %xmm2
; SSE-NEXT:    pslld %xmm15, %xmm2
; SSE-NEXT:    pslld %xmm14, %xmm5
; SSE-NEXT:    movdqa %xmm11, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm2, %xmm5
; SSE-NEXT:    movdqu 112(%rdi,%rcx,4), %xmm2
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    pslld %xmm15, %xmm4
; SSE-NEXT:    pslld %xmm14, %xmm2
; SSE-NEXT:    movdqa %xmm9, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm4, %xmm2
; SSE-NEXT:    movdqu 96(%rdi,%rcx,4), %xmm4
; SSE-NEXT:    movdqa %xmm4, %xmm7
; SSE-NEXT:    pslld %xmm15, %xmm7
; SSE-NEXT:    pslld %xmm14, %xmm4
; SSE-NEXT:    movdqa %xmm8, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm7, %xmm4
; SSE-NEXT:    movups %xmm10, (%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm3, 16(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm6, 32(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm12, 48(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm5, 64(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm1, 80(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm4, 96(%rdi,%rcx,4)
; SSE-NEXT:    movups %xmm2, 112(%rdi,%rcx,4)
; SSE-NEXT:    addq $32, %rcx
; SSE-NEXT:    cmpq %rcx, %rdx
; SSE-NEXT:    jne .LBB0_4
; SSE-NEXT:  # %bb.5: # %middle.block
; SSE-NEXT:    cmpq %rax, %rdx
; SSE-NEXT:    jne .LBB0_6
; SSE-NEXT:  .LBB0_9: # %for.cond.cleanup
; SSE-NEXT:    retq
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB0_8: # %for.body
; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
; SSE-NEXT:    shll %cl, (%rdi,%rdx,4)
; SSE-NEXT:    incq %rdx
; SSE-NEXT:    cmpq %rdx, %rax
; SSE-NEXT:    je .LBB0_9
; SSE-NEXT:  .LBB0_6: # %for.body
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    cmpb $0, (%rsi,%rdx)
; SSE-NEXT:    movl %r9d, %ecx
; SSE-NEXT:    je .LBB0_8
; SSE-NEXT:  # %bb.7: # %for.body
; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
; SSE-NEXT:    movl %r8d, %ecx
; SSE-NEXT:    jmp .LBB0_8
;
; AVX1-LABEL: vector_variable_shift_left_loop:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    subq $24, %rsp
; AVX1-NEXT:    testl %edx, %edx
; AVX1-NEXT:    jle .LBB0_9
; AVX1-NEXT:  # %bb.1: # %for.body.preheader
; AVX1-NEXT:    movl %ecx, %r9d
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    cmpl $31, %edx
; AVX1-NEXT:    ja .LBB0_3
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    xorl %edx, %edx
; AVX1-NEXT:    jmp .LBB0_6
; AVX1-NEXT:  .LBB0_3: # %vector.ph
; AVX1-NEXT:    movl %eax, %edx
; AVX1-NEXT:    andl $-32, %edx
; AVX1-NEXT:    vmovd %r9d, %xmm0
; AVX1-NEXT:    vmovd %r8d, %xmm1
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm14 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm15 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT:    vpxor %xmm11, %xmm11, %xmm11
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB0_4: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vpmovzxdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm2 = mem[0],zero,mem[1],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX1-NEXT:    vmovq {{.*#+}} xmm6 = mem[0],zero
; AVX1-NEXT:    vpcmpeqb %xmm11, %xmm3, %xmm3
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm3, %xmm3
; AVX1-NEXT:    vpcmpeqb %xmm11, %xmm4, %xmm4
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm8
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm4, %xmm4
; AVX1-NEXT:    vpcmpeqb %xmm11, %xmm5, %xmm5
; AVX1-NEXT:    vmovdqu (%rdi,%rcx,4), %xmm9
; AVX1-NEXT:    vpslld %xmm2, %xmm9, %xmm10
; AVX1-NEXT:    vpslld %xmm1, %xmm9, %xmm0
; AVX1-NEXT:    vblendvps %xmm7, %xmm10, %xmm0, %xmm9
; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm7
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm5, %xmm5
; AVX1-NEXT:    vpcmpeqb %xmm11, %xmm6, %xmm6
; AVX1-NEXT:    vmovdqu 16(%rdi,%rcx,4), %xmm0
; AVX1-NEXT:    vpslld %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpslld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpmovsxbd %xmm6, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
; AVX1-NEXT:    vpmovsxbd %xmm6, %xmm6
; AVX1-NEXT:    vblendvps %xmm3, %xmm2, %xmm0, %xmm10
; AVX1-NEXT:    vmovdqu 32(%rdi,%rcx,4), %xmm2
; AVX1-NEXT:    vpslld %xmm15, %xmm2, %xmm3
; AVX1-NEXT:    vpslld %xmm14, %xmm2, %xmm2
; AVX1-NEXT:    vblendvps %xmm8, %xmm3, %xmm2, %xmm8
; AVX1-NEXT:    vmovdqu 48(%rdi,%rcx,4), %xmm3
; AVX1-NEXT:    vpslld %xmm15, %xmm3, %xmm0
; AVX1-NEXT:    vpslld %xmm14, %xmm3, %xmm3
; AVX1-NEXT:    vblendvps %xmm4, %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vmovdqu 64(%rdi,%rcx,4), %xmm3
; AVX1-NEXT:    vpslld %xmm13, %xmm3, %xmm4
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-NEXT:    vpslld %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vblendvps %xmm7, %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vmovdqu 80(%rdi,%rcx,4), %xmm4
; AVX1-NEXT:    vpslld %xmm13, %xmm4, %xmm7
; AVX1-NEXT:    vpslld %xmm2, %xmm4, %xmm4
; AVX1-NEXT:    vblendvps %xmm5, %xmm7, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqu 96(%rdi,%rcx,4), %xmm5
; AVX1-NEXT:    vpslld %xmm12, %xmm5, %xmm7
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-NEXT:    vpslld %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vblendvps %xmm1, %xmm7, %xmm5, %xmm1
; AVX1-NEXT:    vmovdqu 112(%rdi,%rcx,4), %xmm5
; AVX1-NEXT:    vpslld %xmm12, %xmm5, %xmm7
; AVX1-NEXT:    vpslld %xmm2, %xmm5, %xmm5
; AVX1-NEXT:    vblendvps %xmm6, %xmm7, %xmm5, %xmm5
; AVX1-NEXT:    vmovups %xmm9, (%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm10, 16(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm8, 32(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm0, 48(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm3, 64(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm4, 80(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm1, 96(%rdi,%rcx,4)
; AVX1-NEXT:    vmovups %xmm5, 112(%rdi,%rcx,4)
; AVX1-NEXT:    addq $32, %rcx
; AVX1-NEXT:    cmpq %rcx, %rdx
; AVX1-NEXT:    jne .LBB0_4
; AVX1-NEXT:  # %bb.5: # %middle.block
; AVX1-NEXT:    cmpq %rax, %rdx
; AVX1-NEXT:    jne .LBB0_6
; AVX1-NEXT:  .LBB0_9: # %for.cond.cleanup
; AVX1-NEXT:    addq $24, %rsp
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB0_8: # %for.body
; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
; AVX1-NEXT:    shll %cl, (%rdi,%rdx,4)
; AVX1-NEXT:    incq %rdx
; AVX1-NEXT:    cmpq %rdx, %rax
; AVX1-NEXT:    je .LBB0_9
; AVX1-NEXT:  .LBB0_6: # %for.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    cmpb $0, (%rsi,%rdx)
; AVX1-NEXT:    movl %r9d, %ecx
; AVX1-NEXT:    je .LBB0_8
; AVX1-NEXT:  # %bb.7: # %for.body
; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX1-NEXT:    movl %r8d, %ecx
; AVX1-NEXT:    jmp .LBB0_8
;
; AVX2-LABEL: vector_variable_shift_left_loop:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    testl %edx, %edx
; AVX2-NEXT:    jle .LBB0_9
; AVX2-NEXT:  # %bb.1: # %for.body.preheader
; AVX2-NEXT:    movl %ecx, %r9d
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    cmpl $31, %edx
; AVX2-NEXT:    ja .LBB0_3
; AVX2-NEXT:  # %bb.2:
; AVX2-NEXT:    xorl %edx, %edx
; AVX2-NEXT:    jmp .LBB0_6
; AVX2-NEXT:  .LBB0_3: # %vector.ph
; AVX2-NEXT:    movl %eax, %edx
; AVX2-NEXT:    andl $-32, %edx
; AVX2-NEXT:    vmovd %r9d, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
; AVX2-NEXT:    vmovd %r8d, %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB0_4: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm3, %ymm3
; AVX2-NEXT:    vblendvps %ymm3, %ymm0, %ymm1, %ymm3
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm4, %ymm4
; AVX2-NEXT:    vblendvps %ymm4, %ymm0, %ymm1, %ymm4
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm5, %ymm5
; AVX2-NEXT:    vblendvps %ymm5, %ymm0, %ymm1, %ymm5
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm6, %ymm6
; AVX2-NEXT:    vblendvps %ymm6, %ymm0, %ymm1, %ymm6
; AVX2-NEXT:    vmovdqu (%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm3, %ymm7, %ymm3
; AVX2-NEXT:    vmovdqu 32(%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm4, %ymm7, %ymm4
; AVX2-NEXT:    vmovdqu 64(%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm5, %ymm7, %ymm5
; AVX2-NEXT:    vmovdqu 96(%rdi,%rcx,4), %ymm7
; AVX2-NEXT:    vpsllvd %ymm6, %ymm7, %ymm6
; AVX2-NEXT:    vmovdqu %ymm3, (%rdi,%rcx,4)
; AVX2-NEXT:    vmovdqu %ymm4, 32(%rdi,%rcx,4)
; AVX2-NEXT:    vmovdqu %ymm5, 64(%rdi,%rcx,4)
; AVX2-NEXT:    vmovdqu %ymm6, 96(%rdi,%rcx,4)
; AVX2-NEXT:    addq $32, %rcx
; AVX2-NEXT:    cmpq %rcx, %rdx
; AVX2-NEXT:    jne .LBB0_4
; AVX2-NEXT:  # %bb.5: # %middle.block
; AVX2-NEXT:    cmpq %rax, %rdx
; AVX2-NEXT:    jne .LBB0_6
; AVX2-NEXT:  .LBB0_9: # %for.cond.cleanup
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB0_8: # %for.body
; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
; AVX2-NEXT:    shll %cl, (%rdi,%rdx,4)
; AVX2-NEXT:    incq %rdx
; AVX2-NEXT:    cmpq %rdx, %rax
; AVX2-NEXT:    je .LBB0_9
; AVX2-NEXT:  .LBB0_6: # %for.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    cmpb $0, (%rsi,%rdx)
; AVX2-NEXT:    movl %r9d, %ecx
; AVX2-NEXT:    je .LBB0_8
; AVX2-NEXT:  # %bb.7: # %for.body
; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
; AVX2-NEXT:    movl %r8d, %ecx
; AVX2-NEXT:    jmp .LBB0_8
entry:
  ; Nothing to do unless count is positive (signed compare).
  %cmp12 = icmp sgt i32 %count, 0
  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %wide.trip.count = zext i32 %count to i64
  ; Fewer than 32 elements: skip the vector loop and go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %count, 32
  br i1 %min.iters.check, label %for.body.preheader40, label %vector.ph

for.body.preheader40:
  ; Scalar loop start index: 0 when the vector loop was skipped, otherwise %n.vec.
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

vector.ph:
  ; 4294967264 is 0xFFFFFFE0: the trip count rounded down to a multiple of 32.
  %n.vec = and i64 %wide.trip.count, 4294967264
  ; The splats of %amt0 and %amt1 are built four times each, one pair per
  ; <8 x i32> part of the unrolled loop body.
  %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert24 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat25 = shufflevector <8 x i32> %broadcast.splatinsert24, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert26 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat27 = shufflevector <8 x i32> %broadcast.splatinsert26, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert28 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat29 = shufflevector <8 x i32> %broadcast.splatinsert28, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert30 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat31 = shufflevector <8 x i32> %broadcast.splatinsert30, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert32 = insertelement <8 x i32> undef, i32 %amt0, i32 0
  %broadcast.splat33 = shufflevector <8 x i32> %broadcast.splatinsert32, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert34 = insertelement <8 x i32> undef, i32 %amt1, i32 0
  %broadcast.splat35 = shufflevector <8 x i32> %broadcast.splatinsert34, <8 x i32> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Load 4 x 8 control bytes starting at control[index].
  %0 = getelementptr inbounds i8, i8* %control, i64 %index
  %1 = bitcast i8* %0 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %1, align 1
  %2 = getelementptr inbounds i8, i8* %0, i64 8
  %3 = bitcast i8* %2 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %3, align 1
  %4 = getelementptr inbounds i8, i8* %0, i64 16
  %5 = bitcast i8* %4 to <8 x i8>*
  %wide.load18 = load <8 x i8>, <8 x i8>* %5, align 1
  %6 = getelementptr inbounds i8, i8* %0, i64 24
  %7 = bitcast i8* %6 to <8 x i8>*
  %wide.load19 = load <8 x i8>, <8 x i8>* %7, align 1
  ; Per lane: shift amount is amt0 where the control byte is zero, else amt1.
  %8 = icmp eq <8 x i8> %wide.load, zeroinitializer
  %9 = icmp eq <8 x i8> %wide.load17, zeroinitializer
  %10 = icmp eq <8 x i8> %wide.load18, zeroinitializer
  %11 = icmp eq <8 x i8> %wide.load19, zeroinitializer
  %12 = select <8 x i1> %8, <8 x i32> %broadcast.splat21, <8 x i32> %broadcast.splat23
  %13 = select <8 x i1> %9, <8 x i32> %broadcast.splat25, <8 x i32> %broadcast.splat27
  %14 = select <8 x i1> %10, <8 x i32> %broadcast.splat29, <8 x i32> %broadcast.splat31
  %15 = select <8 x i1> %11, <8 x i32> %broadcast.splat33, <8 x i32> %broadcast.splat35
  ; Load 4 x 8 data elements starting at arr[index].
  %16 = getelementptr inbounds i32, i32* %arr, i64 %index
  %17 = bitcast i32* %16 to <8 x i32>*
  %wide.load36 = load <8 x i32>, <8 x i32>* %17, align 4
  %18 = getelementptr inbounds i32, i32* %16, i64 8
  %19 = bitcast i32* %18 to <8 x i32>*
  %wide.load37 = load <8 x i32>, <8 x i32>* %19, align 4
  %20 = getelementptr inbounds i32, i32* %16, i64 16
  %21 = bitcast i32* %20 to <8 x i32>*
  %wide.load38 = load <8 x i32>, <8 x i32>* %21, align 4
  %22 = getelementptr inbounds i32, i32* %16, i64 24
  %23 = bitcast i32* %22 to <8 x i32>*
  %wide.load39 = load <8 x i32>, <8 x i32>* %23, align 4
  ; The non-uniform vector shifts this test is about.
  %24 = shl <8 x i32> %wide.load36, %12
  %25 = shl <8 x i32> %wide.load37, %13
  %26 = shl <8 x i32> %wide.load38, %14
  %27 = shl <8 x i32> %wide.load39, %15
  ; Store the shifted values back over the loaded elements.
  %28 = bitcast i32* %16 to <8 x i32>*
  store <8 x i32> %24, <8 x i32>* %28, align 4
  %29 = bitcast i32* %18 to <8 x i32>*
  store <8 x i32> %25, <8 x i32>* %29, align 4
  %30 = bitcast i32* %20 to <8 x i32>*
  store <8 x i32> %26, <8 x i32>* %30, align 4
  %31 = bitcast i32* %22 to <8 x i32>*
  store <8 x i32> %27, <8 x i32>* %31, align 4
  %index.next = add i64 %index, 32
  %32 = icmp eq i64 %index.next, %n.vec
  br i1 %32, label %middle.block, label %vector.body

middle.block:
  ; Done if the vector loop covered every element; otherwise finish in the scalar loop.
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader40

for.cond.cleanup:
  ret void

for.body:
  ; Scalar loop: one element per iteration, same select-then-shift as above.
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader40 ]
  %arrayidx = getelementptr inbounds i8, i8* %control, i64 %indvars.iv
  %33 = load i8, i8* %arrayidx, align 1
  %tobool = icmp eq i8 %33, 0
  %cond = select i1 %tobool, i32 %amt0, i32 %amt1
  %arrayidx2 = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
  %34 = load i32, i32* %arrayidx2, align 4
  %shl = shl i32 %34, %cond
  store i32 %shl, i32* %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Reduced variant of the loop above. Each iteration stores
;   splat(x) << ((control[i..i+3] == 0) ? splat(amt0) : splat(amt1))
; to arr[i..i+3]; arr is only written, never read. There is no scalar remainder
; loop: the vector loop exits when the index equals count rounded down to a
; multiple of 4.
; What the assertions pin down: the SSE4.1 and AVX1 output computes both shifted
; values of x once in %vector.ph (pslld $23 / paddd / cvttps2dq / pmulld) so the
; loop body is only compare, blend and store. With AVX2 the loop blends the two
; amounts and keeps a vpsllvd inside the loop.

define void @vector_variable_shift_left_loop_simpler(i32* nocapture %arr, i8* nocapture readonly %control, i32 %count, i32 %amt0, i32 %amt1, i32 %x) nounwind {
; SSE-LABEL: vector_variable_shift_left_loop_simpler:
; SSE:       # %bb.0: # %entry
; SSE-NEXT:    testl %edx, %edx
; SSE-NEXT:    jle .LBB1_3
; SSE-NEXT:  # %bb.1: # %vector.ph
; SSE-NEXT:    movl %edx, %eax
; SSE-NEXT:    andl $-4, %eax
; SSE-NEXT:    movd %ecx, %xmm0
; SSE-NEXT:    movd %r8d, %xmm2
; SSE-NEXT:    movd %r9d, %xmm3
; SSE-NEXT:    xorl %ecx, %ecx
; SSE-NEXT:    pslld $23, %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; SSE-NEXT:    paddd %xmm4, %xmm0
; SSE-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE-NEXT:    pmulld %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE-NEXT:    pslld $23, %xmm2
; SSE-NEXT:    paddd %xmm4, %xmm2
; SSE-NEXT:    cvttps2dq %xmm2, %xmm0
; SSE-NEXT:    pmulld %xmm3, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0]
; SSE-NEXT:    pxor %xmm3, %xmm3
; SSE-NEXT:    .p2align 4, 0x90
; SSE-NEXT:  .LBB1_2: # %vector.body
; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
; SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; SSE-NEXT:    pcmpeqd %xmm3, %xmm0
; SSE-NEXT:    movdqa %xmm2, %xmm4
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm4
; SSE-NEXT:    movups %xmm4, (%rdi,%rcx,4)
; SSE-NEXT:    addq $4, %rcx
; SSE-NEXT:    cmpq %rcx, %rax
; SSE-NEXT:    jne .LBB1_2
; SSE-NEXT:  .LBB1_3: # %exit
; SSE-NEXT:    retq
;
; AVX1-LABEL: vector_variable_shift_left_loop_simpler:
; AVX1:       # %bb.0: # %entry
; AVX1-NEXT:    testl %edx, %edx
; AVX1-NEXT:    jle .LBB1_3
; AVX1-NEXT:  # %bb.1: # %vector.ph
; AVX1-NEXT:    movl %edx, %eax
; AVX1-NEXT:    andl $-4, %eax
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    vmovd %r8d, %xmm1
; AVX1-NEXT:    vmovd %r9d, %xmm2
; AVX1-NEXT:    xorl %ecx, %ecx
; AVX1-NEXT:    vpslld $23, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    .p2align 4, 0x90
; AVX1-NEXT:  .LBB1_2: # %vector.body
; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vblendvps %xmm3, %xmm0, %xmm1, %xmm3
; AVX1-NEXT:    vmovups %xmm3, (%rdi,%rcx,4)
; AVX1-NEXT:    addq $4, %rcx
; AVX1-NEXT:    cmpq %rcx, %rax
; AVX1-NEXT:    jne .LBB1_2
; AVX1-NEXT:  .LBB1_3: # %exit
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vector_variable_shift_left_loop_simpler:
; AVX2:       # %bb.0: # %entry
; AVX2-NEXT:    testl %edx, %edx
; AVX2-NEXT:    jle .LBB1_3
; AVX2-NEXT:  # %bb.1: # %vector.ph
; AVX2-NEXT:    movl %edx, %eax
; AVX2-NEXT:    andl $-4, %eax
; AVX2-NEXT:    vmovd %ecx, %xmm0
; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX2-NEXT:    vmovd %r8d, %xmm1
; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
; AVX2-NEXT:    vmovd %r9d, %xmm2
; AVX2-NEXT:    vpbroadcastd %xmm2, %xmm2
; AVX2-NEXT:    xorl %ecx, %ecx
; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    .p2align 4, 0x90
; AVX2-NEXT:  .LBB1_2: # %vector.body
; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm4, %xmm4
; AVX2-NEXT:    vblendvps %xmm4, %xmm0, %xmm1, %xmm4
; AVX2-NEXT:    vpsllvd %xmm4, %xmm2, %xmm4
; AVX2-NEXT:    vmovdqu %xmm4, (%rdi,%rcx,4)
; AVX2-NEXT:    addq $4, %rcx
; AVX2-NEXT:    cmpq %rcx, %rax
; AVX2-NEXT:    jne .LBB1_2
; AVX2-NEXT:  .LBB1_3: # %exit
; AVX2-NEXT:    retq
entry:
  ; Nothing to do unless count is positive (signed compare).
  %cmp16 = icmp sgt i32 %count, 0
  %wide.trip.count = zext i32 %count to i64
  br i1 %cmp16, label %vector.ph, label %exit

vector.ph:
  ; 4294967292 is 0xFFFFFFFC: the trip count rounded down to a multiple of 4.
  %n.vec = and i64 %wide.trip.count, 4294967292
  ; Loop-invariant splats of the two candidate shift amounts and of the value
  ; being shifted.
  %splatinsert18 = insertelement <4 x i32> undef, i32 %amt0, i32 0
  %splat1 = shufflevector <4 x i32> %splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
  %splatinsert20 = insertelement <4 x i32> undef, i32 %amt1, i32 0
  %splat2 = shufflevector <4 x i32> %splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
  %splatinsert22 = insertelement <4 x i32> undef, i32 %x, i32 0
  %splat3 = shufflevector <4 x i32> %splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %control, i64 %index
  %1 = bitcast i8* %0 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %1, align 1
  ; Per lane: shift amount is amt0 where the control byte is zero, else amt1.
  %2 = icmp eq <4 x i8> %wide.load, zeroinitializer
  %3 = select <4 x i1> %2, <4 x i32> %splat1, <4 x i32> %splat2
  ; Non-uniform shift of a loop-invariant value by one of two loop-invariant amounts.
  %4 = shl <4 x i32> %splat3, %3
  %5 = getelementptr inbounds i32, i32* %arr, i64 %index
  %6 = bitcast i32* %5 to <4 x i32>*
  store <4 x i32> %4, <4 x i32>* %6, align 4
  %index.next = add i64 %index, 4
  %7 = icmp eq i64 %index.next, %n.vec
  br i1 %7, label %exit, label %vector.body

exit:
  ret void
}