; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefixes=AVX,XOP

define void @insert_v7i8_v2i16_2(<7 x i8> *%a0, <2 x i16> *%a1) nounwind {
; SSE-LABEL: insert_v7i8_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    movl (%rsi), %eax
; SSE-NEXT:    movd %eax, %xmm0
; SSE-NEXT:    movq (%rdi), %rcx
; SSE-NEXT:    movq %rcx, %xmm1
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    shrq $48, %rcx
; SSE-NEXT:    movb %cl, 6(%rdi)
; SSE-NEXT:    shrl $16, %eax
; SSE-NEXT:    movw %ax, 4(%rdi)
; SSE-NEXT:    movd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: insert_v7i8_v2i16_2:
; AVX:       # %bb.0:
; AVX-NEXT:    movl (%rsi), %eax
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    movq (%rdi), %rcx
; AVX-NEXT:    vmovq %rcx, %xmm1
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    shrq $48, %rcx
; AVX-NEXT:    movb %cl, 6(%rdi)
; AVX-NEXT:    shrl $16, %eax
; AVX-NEXT:    movw %ax, 4(%rdi)
; AVX-NEXT:    vmovd %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: insert_v7i8_v2i16_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl (%rsi), %eax
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    movq (%rdi), %rcx
; AVX512-NEXT:    vmovq %rcx, %xmm1
; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT:    shrq $48, %rcx
; AVX512-NEXT:    movb %cl, 6(%rdi)
; AVX512-NEXT:    shrl $16, %eax
; AVX512-NEXT:    movw %ax, 4(%rdi)
; AVX512-NEXT:    vmovd %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %1 = load <2 x i16>, <2 x i16> *%a1
  %2 = bitcast <2 x i16> %1 to <4 x i8>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <7 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef>
  %4 = load <7 x i8>, <7 x i8> *%a0
  %5 = shufflevector <7 x i8> %4, <7 x i8> %3, <7 x i32> <i32 0, i32 1, i32 7, i32 8, i32 9, i32 10, i32 6>
  store <7 x i8> %5, <7 x i8>* %a0
  ret void
}

%struct.Mat4 = type { %struct.storage }
%struct.storage = type { [16 x float] }

define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %struct.Mat4* nocapture dereferenceable(64)) {
; SSE-LABEL: PR40815:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps (%rdi), %xmm0
; SSE-NEXT:    movaps 16(%rdi), %xmm1
; SSE-NEXT:    movaps 32(%rdi), %xmm2
; SSE-NEXT:    movaps 48(%rdi), %xmm3
; SSE-NEXT:    movaps %xmm3, (%rsi)
; SSE-NEXT:    movaps %xmm2, 16(%rsi)
; SSE-NEXT:    movaps %xmm1, 32(%rsi)
; SSE-NEXT:    movaps %xmm0, 48(%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: PR40815:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vmovaps 16(%rdi), %xmm1
; AVX-NEXT:    vmovaps 32(%rdi), %xmm2
; AVX-NEXT:    vmovaps 48(%rdi), %xmm3
; AVX-NEXT:    vmovaps %xmm2, 16(%rsi)
; AVX-NEXT:    vmovaps %xmm3, (%rsi)
; AVX-NEXT:    vmovaps %xmm0, 48(%rsi)
; AVX-NEXT:    vmovaps %xmm1, 32(%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR40815:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps 16(%rdi), %xmm0
; AVX512-NEXT:    vmovaps 48(%rdi), %xmm1
; AVX512-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; AVX512-NEXT:    vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
; AVX512-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT:    vmovups %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %3 = bitcast %struct.Mat4* %0 to <16 x float>*
  %4 = load <16 x float>, <16 x float>* %3, align 64
  %5 = shufflevector <16 x float> %4, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %6 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 4
  %7 = bitcast <16 x float> %4 to <4 x i128>
  %8 = extractelement <4 x i128> %7, i32 1
  %9 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 8
  %10 = bitcast <16 x float> %4 to <4 x i128>
  %11 = extractelement <4 x i128> %10, i32 2
  %12 = getelementptr inbounds %struct.Mat4, %struct.Mat4* %1, i64 0, i32 0, i32 0, i64 12
  %13 = bitcast float* %12 to <4 x float>*
  %14 = bitcast <16 x float> %4 to <4 x i128>
  %15 = extractelement <4 x i128> %14, i32 3
  %16 = bitcast %struct.Mat4* %1 to i128*
  store i128 %15, i128* %16, align 16
  %17 = bitcast float* %6 to i128*
  store i128 %11, i128* %17, align 16
  %18 = bitcast float* %9 to i128*
  store i128 %8, i128* %18, align 16
  store <4 x float> %5, <4 x float>* %13, align 16
  ret void
}

define <16 x i32> @PR42819(<8 x i32>* %a0) {
; SSE-LABEL: PR42819:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqu (%rdi), %xmm3
; SSE-NEXT:    pslldq {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11]
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    retq
;
; AVX-LABEL: PR42819:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[0,0,1,2]
; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: PR42819:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqu (%rdi), %ymm0
; AVX512-NEXT:    movw $-8192, %ax # imm = 0xE000
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
; AVX512-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32>* %a0, align 4
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <16 x i32> zeroinitializer, <16 x i32> %2, <16 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
  ret <16 x i32> %3
}

@b = dso_local local_unnamed_addr global i32 0, align 4
@c = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16
@d = dso_local local_unnamed_addr global [49 x i32] zeroinitializer, align 16

define void @PR42833() {
; SSE2-LABEL: PR42833:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa c+144(%rip), %xmm1
; SSE2-NEXT:    movdqa c+128(%rip), %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    addl b(%rip), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    paddd %xmm0, %xmm3
; SSE2-NEXT:    movdqa d+144(%rip), %xmm4
; SSE2-NEXT:    psubd %xmm1, %xmm4
; SSE2-NEXT:    paddd %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    paddd %xmm0, %xmm5
; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3]
; SSE2-NEXT:    movdqa %xmm1, c+144(%rip)
; SSE2-NEXT:    movaps %xmm5, c+128(%rip)
; SSE2-NEXT:    movdqa c+160(%rip), %xmm1
; SSE2-NEXT:    movdqa c+176(%rip), %xmm3
; SSE2-NEXT:    movdqa d+160(%rip), %xmm5
; SSE2-NEXT:    movdqa d+176(%rip), %xmm6
; SSE2-NEXT:    movdqa d+128(%rip), %xmm7
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; SSE2-NEXT:    psubd %xmm0, %xmm7
; SSE2-NEXT:    psubd %xmm3, %xmm6
; SSE2-NEXT:    psubd %xmm1, %xmm5
; SSE2-NEXT:    movdqa %xmm5, d+160(%rip)
; SSE2-NEXT:    movdqa %xmm6, d+176(%rip)
; SSE2-NEXT:    movdqa %xmm4, d+144(%rip)
; SSE2-NEXT:    movdqa %xmm7, d+128(%rip)
; SSE2-NEXT:    paddd %xmm3, %xmm3
; SSE2-NEXT:    paddd %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, c+160(%rip)
; SSE2-NEXT:    movdqa %xmm3, c+176(%rip)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: PR42833:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa c+144(%rip), %xmm0
; SSE42-NEXT:    movdqa c+128(%rip), %xmm1
; SSE42-NEXT:    movd %xmm1, %eax
; SSE42-NEXT:    addl b(%rip), %eax
; SSE42-NEXT:    movd %eax, %xmm2
; SSE42-NEXT:    paddd %xmm1, %xmm2
; SSE42-NEXT:    movdqa d+144(%rip), %xmm3
; SSE42-NEXT:    psubd %xmm0, %xmm3
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm1, %xmm4
; SSE42-NEXT:    paddd %xmm1, %xmm4
; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7]
; SSE42-NEXT:    movdqa %xmm0, c+144(%rip)
; SSE42-NEXT:    movdqa %xmm4, c+128(%rip)
; SSE42-NEXT:    movdqa c+160(%rip), %xmm0
; SSE42-NEXT:    movdqa c+176(%rip), %xmm2
; SSE42-NEXT:    movdqa d+160(%rip), %xmm4
; SSE42-NEXT:    movdqa d+176(%rip), %xmm5
; SSE42-NEXT:    movdqa d+128(%rip), %xmm6
; SSE42-NEXT:    pinsrd $0, %eax, %xmm1
; SSE42-NEXT:    psubd %xmm1, %xmm6
; SSE42-NEXT:    psubd %xmm2, %xmm5
; SSE42-NEXT:    psubd %xmm0, %xmm4
; SSE42-NEXT:    movdqa %xmm4, d+160(%rip)
; SSE42-NEXT:    movdqa %xmm5, d+176(%rip)
; SSE42-NEXT:    movdqa %xmm3, d+144(%rip)
; SSE42-NEXT:    movdqa %xmm6, d+128(%rip)
; SSE42-NEXT:    paddd %xmm2, %xmm2
; SSE42-NEXT:    paddd %xmm0, %xmm0
; SSE42-NEXT:    movdqa %xmm0, c+160(%rip)
; SSE42-NEXT:    movdqa %xmm2, c+176(%rip)
; SSE42-NEXT:    retq
;
; AVX1-LABEL: PR42833:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa c+128(%rip), %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    addl b(%rip), %eax
; AVX1-NEXT:    vmovd %eax, %xmm1
; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa c+144(%rip), %xmm3
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovdqa d+144(%rip), %xmm2
; AVX1-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmovups %ymm1, c+128(%rip)
; AVX1-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa d+128(%rip), %xmm1
; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vmovdqa d+176(%rip), %xmm1
; AVX1-NEXT:    vmovdqa c+176(%rip), %xmm3
; AVX1-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa d+160(%rip), %xmm4
; AVX1-NEXT:    vmovdqa c+160(%rip), %xmm5
; AVX1-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vmovdqa %xmm2, d+144(%rip)
; AVX1-NEXT:    vmovdqa %xmm4, d+160(%rip)
; AVX1-NEXT:    vmovdqa %xmm1, d+176(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, d+128(%rip)
; AVX1-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; AVX1-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; AVX1-NEXT:    vmovdqa %xmm1, c+160(%rip)
; AVX1-NEXT:    vmovdqa %xmm0, c+176(%rip)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR42833:
; AVX2:       # %bb.0:
; AVX2-NEXT:    movl b(%rip), %eax
; AVX2-NEXT:    vmovdqu c+128(%rip), %ymm0
; AVX2-NEXT:    addl c+128(%rip), %eax
; AVX2-NEXT:    vmovd %eax, %xmm1
; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpaddd %ymm0, %ymm0, %ymm3
; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3,4,5,6,7]
; AVX2-NEXT:    vmovdqu %ymm2, c+128(%rip)
; AVX2-NEXT:    vmovdqu c+160(%rip), %ymm2
; AVX2-NEXT:    vmovdqu d+160(%rip), %ymm3
; AVX2-NEXT:    vmovdqu d+128(%rip), %ymm4
; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
; AVX2-NEXT:    vpsubd %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpsubd %ymm2, %ymm3, %ymm1
; AVX2-NEXT:    vmovdqu %ymm1, d+160(%rip)
; AVX2-NEXT:    vmovdqu %ymm0, d+128(%rip)
; AVX2-NEXT:    vpaddd %ymm2, %ymm2, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, c+160(%rip)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR42833:
; AVX512:       # %bb.0:
; AVX512-NEXT:    movl b(%rip), %eax
; AVX512-NEXT:    vmovdqu c+128(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 c+128(%rip), %zmm1
; AVX512-NEXT:    addl c+128(%rip), %eax
; AVX512-NEXT:    vmovd %eax, %xmm2
; AVX512-NEXT:    vpaddd %ymm2, %ymm0, %ymm2
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7]
; AVX512-NEXT:    vmovdqa c+128(%rip), %xmm2
; AVX512-NEXT:    vmovdqu %ymm0, c+128(%rip)
; AVX512-NEXT:    vmovdqu c+160(%rip), %ymm0
; AVX512-NEXT:    vmovdqu64 d+128(%rip), %zmm3
; AVX512-NEXT:    vpinsrd $0, %eax, %xmm2, %xmm2
; AVX512-NEXT:    vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm1
; AVX512-NEXT:    vpsubd %zmm1, %zmm3, %zmm1
; AVX512-NEXT:    vmovdqu64 %zmm1, d+128(%rip)
; AVX512-NEXT:    vpaddd %ymm0, %ymm0, %ymm0
; AVX512-NEXT:    vmovdqu %ymm0, c+160(%rip)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; XOP-LABEL: PR42833:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovdqa c+128(%rip), %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    addl b(%rip), %eax
; XOP-NEXT:    vmovd %eax, %xmm1
; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm1
; XOP-NEXT:    vpaddd %xmm0, %xmm0, %xmm2
; XOP-NEXT:    vmovdqa c+144(%rip), %xmm3
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm3
; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; XOP-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
; XOP-NEXT:    vmovdqa d+144(%rip), %xmm2
; XOP-NEXT:    vpsubd c+144(%rip), %xmm2, %xmm2
; XOP-NEXT:    vmovups %ymm1, c+128(%rip)
; XOP-NEXT:    vpinsrd $0, %eax, %xmm0, %xmm0
; XOP-NEXT:    vmovdqa d+128(%rip), %xmm1
; XOP-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
; XOP-NEXT:    vmovdqa d+176(%rip), %xmm1
; XOP-NEXT:    vmovdqa c+176(%rip), %xmm3
; XOP-NEXT:    vpsubd %xmm3, %xmm1, %xmm1
; XOP-NEXT:    vmovdqa d+160(%rip), %xmm4
; XOP-NEXT:    vmovdqa c+160(%rip), %xmm5
; XOP-NEXT:    vpsubd %xmm5, %xmm4, %xmm4
; XOP-NEXT:    vmovdqa %xmm2, d+144(%rip)
; XOP-NEXT:    vmovdqa %xmm4, d+160(%rip)
; XOP-NEXT:    vmovdqa %xmm1, d+176(%rip)
; XOP-NEXT:    vmovdqa %xmm0, d+128(%rip)
; XOP-NEXT:    vpaddd %xmm3, %xmm3, %xmm0
; XOP-NEXT:    vpaddd %xmm5, %xmm5, %xmm1
; XOP-NEXT:    vmovdqa %xmm1, c+160(%rip)
; XOP-NEXT:    vmovdqa %xmm0, c+176(%rip)
; XOP-NEXT:    vzeroupper
; XOP-NEXT:    retq
  %1 = load i32, i32* @b, align 4
  %2 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <16 x i32> <i32 undef, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = extractelement <8 x i32> %2, i32 0
  %5 = add i32 %1, %4
  %6 = insertelement <8 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %5, i32 0
  %7 = add <8 x i32> %2, %6
  %8 = shl <8 x i32> %2, %6
  %9 = shufflevector <8 x i32> %7, <8 x i32> %8, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  store <8 x i32> %9, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 32) to <8 x i32>*), align 16
  %10 = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  %11 = shufflevector <8 x i32> %10, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
  %12 = load <16 x i32>, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %13 = insertelement <16 x i32> %3, i32 %5, i32 0
  %14 = shufflevector <16 x i32> %13, <16 x i32> %11, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %15 = sub <16 x i32> %12, %14
  store <16 x i32> %15, <16 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @d, i64 0, i64 32) to <16 x i32>*), align 16
  %16 = shl <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  store <8 x i32> %16, <8 x i32>* bitcast (i32* getelementptr inbounds ([49 x i32], [49 x i32]* @c, i64 0, i64 40) to <8 x i32>*), align 16
  ret void
}