; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP

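; Insert a <2 x i16> load, reinterpreted as <4 x i8>, into bytes 2-5 of a
; <7 x i8> value in memory at %a0.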
define void @insert_v7i8_v2i16_2(<7 x i8> *%a0, <2 x i16> *%a1) nounwind {
; SSE-LABEL: insert_v7i8_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rsi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    movq (%rdi), %rcx
; SSE-NEXT:    movq %rcx, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    shrq $48, %rcx
; SSE-NEXT:    movb %cl, 6(%rdi)
; SSE-NEXT:    shrl $16, %eax
; SSE-NEXT:    movw %ax, 4(%rdi)
; SSE-NEXT:    movd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_v7i8_v2i16_2:
; AVX:       # %bb.0:
; AVX-NEXT:    movl (%rsi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    movq (%rdi), %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    shrq $48, %rcx
; AVX-NEXT:    movb %cl, 6(%rdi)
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    movw %ax, 4(%rdi)
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: insert_v7i8_v2i16_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl (%rsi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    movq (%rdi), %rcx
; AVX512-NEXT:    vmovq %rcx, %xmm1
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    shrq $48, %rcx
; AVX512-NEXT:    movb %cl, 6(%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movw %ax, 4(%rdi)
; AVX512-NEXT:    vmovd %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = load <2 x i16>, <2 x i16> *%a1
  %2 = bitcast <2 x i16> %1 to <4 x i8>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef>
  %4 = load <7 x i8>, <7 x i8> *%a0
  %5 = shufflevector <7 x i8> %4, <7 x i8> %3, <7 x i32> <i32 0, i32 1, i32 7, i32 8, i32 9, i32 10, i32 6>
  store <7 x i8> %5, <7 x i8>* %a0
  ret void
}

%struct.Mat4 = type { %struct.storage }
%struct.storage = type { [16 x float] }

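; PR40815 - https://bugs.llvm.org/show_bug.cgi?id=40815
; Copy a 4x4 float matrix (%struct.Mat4) with its four 16-byte rows stored in
; reverse order.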
define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %struct.Mat4* nocapture dereferenceable(64)) {
; SSE-LABEL: PR40815:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    movaps %xmm3, (%rsi)
; SSE-NEXT:    movaps %xmm2, 16(%rsi)
; SSE-NEXT:    movaps %xmm1, 32(%rsi)
; SSE-NEXT:    movaps %xmm0, 48(%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR40815:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vmovaps 48(%rdi), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rsi)
; AVX-NEXT:    vmovaps %xmm3, (%rsi)
; AVX-NEXT:    vmovaps %xmm0, 48(%rsi)
; AVX-NEXT:    vmovaps %xmm1, 32(%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR40815:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX512-NEXT:    vmovaps 48(%rdi), %xmm1
; AVX512-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; AVX512-NEXT:    vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT:    vmovups %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %3 = bitcast %struct.Mat4* %0 to <16 x float>*
  %4 = load <16 x float>, <16 x float>* %3, align 64
  %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 4
  %7 = bitcast <16 x float> %4 to <4 x i128>
  %8 = extractelement <4 x i128> %7, i32 1
  %9 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 8
  %10 = bitcast <16 x float> %4 to <4 x i128>
  %11 = extractelement <4 x i128> %10, i32 2
  %12 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 12
  %13 = bitcast float* %12 to <4 x float>*
  %14 = bitcast <16 x float> %4 to <4 x i128>
  %15 = extractelement <4 x i128> %14, i32 3
  %16 = bitcast %struct.Mat4* %1 to i128*
  store i128 %15, i128* %16, align 16
  %17 = bitcast float* %6 to i128*
  store i128 %11, i128* %17, align 16
  %18 = bitcast float* %9 to i128*
  store i128 %8, i128* %18, align 16
  store <4 x float> %5, <4 x float>* %13, align 16
  ret void
}

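; PR42819 - https://bugs.llvm.org/show_bug.cgi?id=42819
; The result should be 13 zero elements followed by the low 3 elements of the
; loaded vector - an expand pattern that AVX512 can match with a masked
; vpexpandd.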
define <16 x i32> @PR42819(<8 x i32>* %a0) {
; SSE-LABEL: PR42819:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm3
; SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    retq
;
; AVX-LABEL: PR42819:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,1,2]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR42819:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512-NEXT:    movw $-8192, %ax # imm = 0xE000
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %a0, align 4
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <16 x i32> zeroinitializer, <16 x i32> %2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
  ret <16 x i32> %3
}

@b = dso_local local_unnamed_addr global i32 0, align 4
@c = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16
@d = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16

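; PR42833 - https://bugs.llvm.org/show_bug.cgi?id=42833
; Blend of an add and a shl of an <8 x i32> slice of @c, where lane 0 uses the
; scalar @b + c[32]; a <16 x i32> subtract is stored back to @d and another
; slice of @c is shifted left by one.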
define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    addl {{.*}}(%rip), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm4
; SSE2-NEXT:    psubd %xmm1, %xmm4
; SSE2-NEXT:    paddd %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    paddd %xmm0, %xmm5
; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT:    movdqa %xmm1, c+{{.*}}(%rip)
; SSE2-NEXT:    movaps %xmm5, c+{{.*}}(%rip)
; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm1
; SSE2-NEXT:    movdqa c+{{.*}}(%rip), %xmm3
; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm5
; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm6
; SSE2-NEXT:    movdqa d+{{.*}}(%rip), %xmm7
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT:    psubd %xmm0, %xmm7
; SSE2-NEXT:    psubd %xmm3, %xmm6
; SSE2-NEXT:    psubd %xmm1, %xmm5
; SSE2-NEXT:    movdqa %xmm5, d+{{.*}}(%rip)
; SSE2-NEXT:    movdqa %xmm6, d+{{.*}}(%rip)
; SSE2-NEXT:    movdqa %xmm4, d+{{.*}}(%rip)
; SSE2-NEXT:    movdqa %xmm7, d+{{.*}}(%rip)
; SSE2-NEXT:    paddd %xmm3, %xmm3
; SSE2-NEXT:    paddd %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, c+{{.*}}(%rip)
; SSE2-NEXT:    movdqa %xmm3, c+{{.*}}(%rip)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: PR42833:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm0
; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm1
; SSE42-NEXT:    movd %xmm1, %eax
; SSE42-NEXT:    addl {{.*}}(%rip), %eax
; SSE42-NEXT:    movd %eax, %xmm2
; SSE42-NEXT:    paddd %xmm1, %xmm2
; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm3
; SSE42-NEXT:    psubd %xmm0, %xmm3
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm1, %xmm4
; SSE42-NEXT:    paddd %xmm1, %xmm4
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT:    movdqa %xmm0, c+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm4, c+{{.*}}(%rip)
; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm0
; SSE42-NEXT:    movdqa c+{{.*}}(%rip), %xmm2
; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm4
; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm5
; SSE42-NEXT:    movdqa d+{{.*}}(%rip), %xmm6
; SSE42-NEXT:    pinsrd $0, %eax, %xmm1
; SSE42-NEXT:    psubd %xmm1, %xmm6
; SSE42-NEXT:    psubd %xmm2, %xmm5
; SSE42-NEXT:    psubd %xmm0, %xmm4
; SSE42-NEXT:    movdqa %xmm4, d+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm5, d+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm3, d+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm6, d+{{.*}}(%rip)
; SSE42-NEXT:    paddd %xmm2, %xmm2
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm0, c+{{.*}}(%rip)
; SSE42-NEXT:    movdqa %xmm2, c+{{.*}}(%rip)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: PR42833:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    addl {{.*}}(%rip), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm2
; AVX1-NEXT:    vpsubd c+{{.*}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovups %ymm1, c+{{.*}}(%rip)
; AVX1-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm4
; AVX1-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm5
; AVX1-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa %xmm2, d+{{.*}}(%rip)
; AVX1-NEXT:    vmovdqa %xmm4, d+{{.*}}(%rip)
; AVX1-NEXT:    vmovdqa %xmm1, d+{{.*}}(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, d+{{.*}}(%rip)
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; AVX1-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, c+{{.*}}(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, c+{{.*}}(%rip)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR42833:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl {{.*}}(%rip), %eax
; AVX2-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm0
; AVX2-NEXT:    addl c+{{.*}}(%rip), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm2, c+{{.*}}(%rip)
; AVX2-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm2
; AVX2-NEXT:    vmovdqu d+{{.*}}(%rip), %ymm3
; AVX2-NEXT:    vmovdqu d+{{.*}}(%rip), %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsubd %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm3, %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, d+{{.*}}(%rip)
; AVX2-NEXT:    vmovdqu %ymm0, d+{{.*}}(%rip)
; AVX2-NEXT:    vpaddd %ymm2, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, c+{{.*}}(%rip)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR42833:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl {{.*}}(%rip), %eax
; AVX512-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 c+{{.*}}(%rip), %zmm1
; AVX512-NEXT:    addl c+{{.*}}(%rip), %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7]
; AVX512-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm2
; AVX512-NEXT:    vmovdqu %ymm0, c+{{.*}}(%rip)
; AVX512-NEXT:    vmovdqu c+{{.*}}(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 d+{{.*}}(%rip), %zmm3
; AVX512-NEXT:    vpinsrd $0, %eax, %xmm2, %xmm2
; AVX512-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm1
; AVX512-NEXT:    vpsubd %zmm1, %zmm3, %zmm1
; AVX512-NEXT:    vmovdqu64 %zmm1, d+{{.*}}(%rip)
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqu %ymm0, c+{{.*}}(%rip)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; XOP-LABEL: PR42833:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    addl {{.*}}(%rip), %eax
; XOP-NEXT:    vmovd %eax, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm2
; XOP-NEXT:    vpsubd c+{{.*}}(%rip), %xmm2, %xmm2
; XOP-NEXT:    vmovups %ymm1, c+{{.*}}(%rip)
; XOP-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm1
; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm3
; XOP-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa d+{{.*}}(%rip), %xmm4
; XOP-NEXT:    vmovdqa c+{{.*}}(%rip), %xmm5
; XOP-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; XOP-NEXT:    vmovdqa %xmm2, d+{{.*}}(%rip)
; XOP-NEXT:    vmovdqa %xmm4, d+{{.*}}(%rip)
; XOP-NEXT:    vmovdqa %xmm1, d+{{.*}}(%rip)
; XOP-NEXT:    vmovdqa %xmm0, d+{{.*}}(%rip)
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; XOP-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; XOP-NEXT:    vmovdqa %xmm1, c+{{.*}}(%rip)
; XOP-NEXT:    vmovdqa %xmm0, c+{{.*}}(%rip)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %1 = load i32, i32* @b, align 4
  %2 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = extractelement <8 x i32> %2, i32 0
  %5 = add i32 %1, %4
  %6 = insertelement <8 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %5, i32 0
  %7 = add <8 x i32> %2, %6
  %8 = shl <8 x i32> %2, %6
  %9 = shufflevector <8 x i32> %7, <8 x i32> %8, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <8 x i32> %9, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %10 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  %11 = shufflevector <8 x i32> %10, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %12 = load <16 x i32>, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %13 = insertelement <16 x i32> %3, i32 %5, i32 0
  %14 = shufflevector <16 x i32> %13, <16 x i32> %11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %15 = sub <16 x i32> %12, %14
  store <16 x i32> %15, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %16 = shl <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %16, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  ret void
}