1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 3; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 4; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 5; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 6; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 7; 8; Verify that the DAG combiner correctly folds bitwise operations across 9; shuffles, nested shuffles with undef, pairs of nested shuffles, and other 10; basic and always-safe patterns. Also test that the DAG combiner will combine 11; target-specific shuffle instructions where reasonable. 12 13target triple = "x86_64-unknown-unknown" 14 15declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) 16declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) 17declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) 18 19define <4 x i32> @combine_pshufd1(<4 x i32> %a) { 20; ALL-LABEL: combine_pshufd1: 21; ALL: # BB#0: # %entry 22; ALL-NEXT: retq 23entry: 24 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 25 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) 26 ret <4 x i32> %c 27} 28 29define <4 x i32> @combine_pshufd2(<4 x i32> %a) { 30; ALL-LABEL: combine_pshufd2: 31; ALL: # BB#0: # %entry 32; ALL-NEXT: retq 33entry: 34 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 35 %b.cast = bitcast <4 x i32> %b to <8 x i16> 36 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28) 37 %c.cast = bitcast <8 x i16> %c to <4 x i32> 38 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 39 ret <4 x i32> %d 40} 41 42define <4 x i32> @combine_pshufd3(<4 x 
i32> %a) { 43; ALL-LABEL: combine_pshufd3: 44; ALL: # BB#0: # %entry 45; ALL-NEXT: retq 46entry: 47 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 48 %b.cast = bitcast <4 x i32> %b to <8 x i16> 49 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28) 50 %c.cast = bitcast <8 x i16> %c to <4 x i32> 51 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 52 ret <4 x i32> %d 53} 54 55define <4 x i32> @combine_pshufd4(<4 x i32> %a) { 56; SSE-LABEL: combine_pshufd4: 57; SSE: # BB#0: # %entry 58; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 59; SSE-NEXT: retq 60; 61; AVX-LABEL: combine_pshufd4: 62; AVX: # BB#0: # %entry 63; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 64; AVX-NEXT: retq 65entry: 66 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) 67 %b.cast = bitcast <4 x i32> %b to <8 x i16> 68 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27) 69 %c.cast = bitcast <8 x i16> %c to <4 x i32> 70 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) 71 ret <4 x i32> %d 72} 73 74define <4 x i32> @combine_pshufd5(<4 x i32> %a) { 75; SSE-LABEL: combine_pshufd5: 76; SSE: # BB#0: # %entry 77; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 78; SSE-NEXT: retq 79; 80; AVX-LABEL: combine_pshufd5: 81; AVX: # BB#0: # %entry 82; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 83; AVX-NEXT: retq 84entry: 85 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) 86 %b.cast = bitcast <4 x i32> %b to <8 x i16> 87 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27) 88 %c.cast = bitcast <8 x i16> %c to <4 x i32> 89 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76) 90 ret <4 x i32> %d 91} 92 93define <4 x i32> @combine_pshufd6(<4 x i32> %a) { 94; SSE-LABEL: combine_pshufd6: 95; SSE: # BB#0: # %entry 96; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 97; SSE-NEXT: retq 98; 99; AVX1-LABEL: combine_pshufd6: 
100; AVX1: # BB#0: # %entry 101; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 102; AVX1-NEXT: retq 103; 104; AVX2-LABEL: combine_pshufd6: 105; AVX2: # BB#0: # %entry 106; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 107; AVX2-NEXT: retq 108entry: 109 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) 110 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8) 111 ret <4 x i32> %c 112} 113 114define <8 x i16> @combine_pshuflw1(<8 x i16> %a) { 115; ALL-LABEL: combine_pshuflw1: 116; ALL: # BB#0: # %entry 117; ALL-NEXT: retq 118entry: 119 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 120 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 121 ret <8 x i16> %c 122} 123 124define <8 x i16> @combine_pshuflw2(<8 x i16> %a) { 125; ALL-LABEL: combine_pshuflw2: 126; ALL: # BB#0: # %entry 127; ALL-NEXT: retq 128entry: 129 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 130 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) 131 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 132 ret <8 x i16> %d 133} 134 135define <8 x i16> @combine_pshuflw3(<8 x i16> %a) { 136; SSE-LABEL: combine_pshuflw3: 137; SSE: # BB#0: # %entry 138; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 139; SSE-NEXT: retq 140; 141; AVX-LABEL: combine_pshuflw3: 142; AVX: # BB#0: # %entry 143; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 144; AVX-NEXT: retq 145entry: 146 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 147 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) 148 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 149 ret <8 x i16> %d 150} 151 152define <8 x i16> @combine_pshufhw1(<8 x i16> %a) { 153; SSE-LABEL: combine_pshufhw1: 154; SSE: # BB#0: # %entry 155; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 156; SSE-NEXT: retq 157; 158; AVX-LABEL: combine_pshufhw1: 159; AVX: # BB#0: # %entry 160; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[3,2,1,0,4,5,6,7] 161; AVX-NEXT: retq 162entry: 163 %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) 164 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 165 %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) 166 ret <8 x i16> %d 167} 168 169define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 170; SSE-LABEL: combine_bitwise_ops_test1: 171; SSE: # BB#0: 172; SSE-NEXT: pand %xmm1, %xmm0 173; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 174; SSE-NEXT: retq 175; 176; AVX-LABEL: combine_bitwise_ops_test1: 177; AVX: # BB#0: 178; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 179; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 180; AVX-NEXT: retq 181 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 182 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 183 %and = and <4 x i32> %shuf1, %shuf2 184 ret <4 x i32> %and 185} 186 187define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 188; SSE-LABEL: combine_bitwise_ops_test2: 189; SSE: # BB#0: 190; SSE-NEXT: por %xmm1, %xmm0 191; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 192; SSE-NEXT: retq 193; 194; AVX-LABEL: combine_bitwise_ops_test2: 195; AVX: # BB#0: 196; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 197; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 198; AVX-NEXT: retq 199 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 200 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 201 %or = or <4 x i32> %shuf1, %shuf2 202 ret <4 x i32> %or 203} 204 205define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 206; SSE-LABEL: combine_bitwise_ops_test3: 207; SSE: # BB#0: 208; SSE-NEXT: pxor %xmm1, %xmm0 209; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 210; SSE-NEXT: retq 211; 212; AVX-LABEL: combine_bitwise_ops_test3: 213; AVX: # BB#0: 
214; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 215; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 216; AVX-NEXT: retq 217 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 218 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 219 %xor = xor <4 x i32> %shuf1, %shuf2 220 ret <4 x i32> %xor 221} 222 223define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 224; SSE-LABEL: combine_bitwise_ops_test4: 225; SSE: # BB#0: 226; SSE-NEXT: pand %xmm1, %xmm0 227; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 228; SSE-NEXT: retq 229; 230; AVX-LABEL: combine_bitwise_ops_test4: 231; AVX: # BB#0: 232; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 233; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 234; AVX-NEXT: retq 235 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 236 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 237 %and = and <4 x i32> %shuf1, %shuf2 238 ret <4 x i32> %and 239} 240 241define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 242; SSE-LABEL: combine_bitwise_ops_test5: 243; SSE: # BB#0: 244; SSE-NEXT: por %xmm1, %xmm0 245; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 246; SSE-NEXT: retq 247; 248; AVX-LABEL: combine_bitwise_ops_test5: 249; AVX: # BB#0: 250; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 251; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 252; AVX-NEXT: retq 253 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 254 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 255 %or = or <4 x i32> %shuf1, %shuf2 256 ret <4 x i32> %or 257} 258 259define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 260; SSE-LABEL: combine_bitwise_ops_test6: 261; SSE: # BB#0: 262; SSE-NEXT: pxor %xmm1, %xmm0 263; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 264; 
SSE-NEXT: retq 265; 266; AVX-LABEL: combine_bitwise_ops_test6: 267; AVX: # BB#0: 268; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 269; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 270; AVX-NEXT: retq 271 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 272 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 273 %xor = xor <4 x i32> %shuf1, %shuf2 274 ret <4 x i32> %xor 275} 276 277 278; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles 279; are not performing a swizzle operations. 280 281define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 282; SSE2-LABEL: combine_bitwise_ops_test1b: 283; SSE2: # BB#0: 284; SSE2-NEXT: pand %xmm1, %xmm0 285; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 286; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 287; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 288; SSE2-NEXT: retq 289; 290; SSSE3-LABEL: combine_bitwise_ops_test1b: 291; SSSE3: # BB#0: 292; SSSE3-NEXT: pand %xmm1, %xmm0 293; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 294; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 295; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 296; SSSE3-NEXT: retq 297; 298; SSE41-LABEL: combine_bitwise_ops_test1b: 299; SSE41: # BB#0: 300; SSE41-NEXT: pand %xmm1, %xmm0 301; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 302; SSE41-NEXT: retq 303; 304; AVX1-LABEL: combine_bitwise_ops_test1b: 305; AVX1: # BB#0: 306; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 307; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 308; AVX1-NEXT: retq 309; 310; AVX2-LABEL: combine_bitwise_ops_test1b: 311; AVX2: # BB#0: 312; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 313; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 314; AVX2-NEXT: retq 315 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, 
i32 2, i32 7> 316 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 317 %and = and <4 x i32> %shuf1, %shuf2 318 ret <4 x i32> %and 319} 320 321define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 322; SSE2-LABEL: combine_bitwise_ops_test2b: 323; SSE2: # BB#0: 324; SSE2-NEXT: por %xmm1, %xmm0 325; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 326; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 327; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 328; SSE2-NEXT: retq 329; 330; SSSE3-LABEL: combine_bitwise_ops_test2b: 331; SSSE3: # BB#0: 332; SSSE3-NEXT: por %xmm1, %xmm0 333; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 334; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 335; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 336; SSSE3-NEXT: retq 337; 338; SSE41-LABEL: combine_bitwise_ops_test2b: 339; SSE41: # BB#0: 340; SSE41-NEXT: por %xmm1, %xmm0 341; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 342; SSE41-NEXT: retq 343; 344; AVX1-LABEL: combine_bitwise_ops_test2b: 345; AVX1: # BB#0: 346; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 347; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 348; AVX1-NEXT: retq 349; 350; AVX2-LABEL: combine_bitwise_ops_test2b: 351; AVX2: # BB#0: 352; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 353; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 354; AVX2-NEXT: retq 355 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 356 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 357 %or = or <4 x i32> %shuf1, %shuf2 358 ret <4 x i32> %or 359} 360 361define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 362; SSE2-LABEL: combine_bitwise_ops_test3b: 363; SSE2: # BB#0: 364; SSE2-NEXT: xorps %xmm1, %xmm0 365; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 366; 
SSE2-NEXT: retq 367; 368; SSSE3-LABEL: combine_bitwise_ops_test3b: 369; SSSE3: # BB#0: 370; SSSE3-NEXT: xorps %xmm1, %xmm0 371; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 372; SSSE3-NEXT: retq 373; 374; SSE41-LABEL: combine_bitwise_ops_test3b: 375; SSE41: # BB#0: 376; SSE41-NEXT: pxor %xmm1, %xmm0 377; SSE41-NEXT: pxor %xmm1, %xmm1 378; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 379; SSE41-NEXT: retq 380; 381; AVX1-LABEL: combine_bitwise_ops_test3b: 382; AVX1: # BB#0: 383; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 384; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 385; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 386; AVX1-NEXT: retq 387; 388; AVX2-LABEL: combine_bitwise_ops_test3b: 389; AVX2: # BB#0: 390; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 391; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 392; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 393; AVX2-NEXT: retq 394 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 395 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 396 %xor = xor <4 x i32> %shuf1, %shuf2 397 ret <4 x i32> %xor 398} 399 400define <4 x i32> @combine_bitwise_ops_test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 401; SSE2-LABEL: combine_bitwise_ops_test4b: 402; SSE2: # BB#0: 403; SSE2-NEXT: pand %xmm1, %xmm0 404; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 405; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 406; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 407; SSE2-NEXT: retq 408; 409; SSSE3-LABEL: combine_bitwise_ops_test4b: 410; SSSE3: # BB#0: 411; SSSE3-NEXT: pand %xmm1, %xmm0 412; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 413; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 414; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 415; SSSE3-NEXT: retq 416; 417; SSE41-LABEL: combine_bitwise_ops_test4b: 418; SSE41: # BB#0: 419; SSE41-NEXT: pand %xmm1, %xmm0 
420; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 421; SSE41-NEXT: retq 422; 423; AVX1-LABEL: combine_bitwise_ops_test4b: 424; AVX1: # BB#0: 425; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 426; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 427; AVX1-NEXT: retq 428; 429; AVX2-LABEL: combine_bitwise_ops_test4b: 430; AVX2: # BB#0: 431; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 432; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 433; AVX2-NEXT: retq 434 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 435 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 436 %and = and <4 x i32> %shuf1, %shuf2 437 ret <4 x i32> %and 438} 439 440define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 441; SSE2-LABEL: combine_bitwise_ops_test5b: 442; SSE2: # BB#0: 443; SSE2-NEXT: por %xmm1, %xmm0 444; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 445; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 446; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 447; SSE2-NEXT: retq 448; 449; SSSE3-LABEL: combine_bitwise_ops_test5b: 450; SSSE3: # BB#0: 451; SSSE3-NEXT: por %xmm1, %xmm0 452; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 453; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 454; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 455; SSSE3-NEXT: retq 456; 457; SSE41-LABEL: combine_bitwise_ops_test5b: 458; SSE41: # BB#0: 459; SSE41-NEXT: por %xmm1, %xmm0 460; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 461; SSE41-NEXT: retq 462; 463; AVX1-LABEL: combine_bitwise_ops_test5b: 464; AVX1: # BB#0: 465; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 466; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] 467; AVX1-NEXT: retq 468; 469; AVX2-LABEL: combine_bitwise_ops_test5b: 470; AVX2: # BB#0: 471; AVX2-NEXT: vpor %xmm1, 
%xmm0, %xmm0 472; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 473; AVX2-NEXT: retq 474 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 475 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 476 %or = or <4 x i32> %shuf1, %shuf2 477 ret <4 x i32> %or 478} 479 480define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 481; SSE2-LABEL: combine_bitwise_ops_test6b: 482; SSE2: # BB#0: 483; SSE2-NEXT: xorps %xmm1, %xmm0 484; SSE2-NEXT: andps {{.*}}(%rip), %xmm0 485; SSE2-NEXT: retq 486; 487; SSSE3-LABEL: combine_bitwise_ops_test6b: 488; SSSE3: # BB#0: 489; SSSE3-NEXT: xorps %xmm1, %xmm0 490; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0 491; SSSE3-NEXT: retq 492; 493; SSE41-LABEL: combine_bitwise_ops_test6b: 494; SSE41: # BB#0: 495; SSE41-NEXT: pxor %xmm1, %xmm0 496; SSE41-NEXT: pxor %xmm1, %xmm1 497; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] 498; SSE41-NEXT: retq 499; 500; AVX1-LABEL: combine_bitwise_ops_test6b: 501; AVX1: # BB#0: 502; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 503; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 504; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] 505; AVX1-NEXT: retq 506; 507; AVX2-LABEL: combine_bitwise_ops_test6b: 508; AVX2: # BB#0: 509; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 510; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 511; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 512; AVX2-NEXT: retq 513 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 514 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 515 %xor = xor <4 x i32> %shuf1, %shuf2 516 ret <4 x i32> %xor 517} 518 519define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 520; SSE-LABEL: combine_bitwise_ops_test1c: 521; SSE: # BB#0: 522; SSE-NEXT: andps %xmm1, %xmm0 523; SSE-NEXT: 
shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 524; SSE-NEXT: retq 525; 526; AVX-LABEL: combine_bitwise_ops_test1c: 527; AVX: # BB#0: 528; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 529; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 530; AVX-NEXT: retq 531 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 532 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 533 %and = and <4 x i32> %shuf1, %shuf2 534 ret <4 x i32> %and 535} 536 537define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 538; SSE-LABEL: combine_bitwise_ops_test2c: 539; SSE: # BB#0: 540; SSE-NEXT: orps %xmm1, %xmm0 541; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 542; SSE-NEXT: retq 543; 544; AVX-LABEL: combine_bitwise_ops_test2c: 545; AVX: # BB#0: 546; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 547; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 548; AVX-NEXT: retq 549 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 550 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 551 %or = or <4 x i32> %shuf1, %shuf2 552 ret <4 x i32> %or 553} 554 555define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 556; SSE2-LABEL: combine_bitwise_ops_test3c: 557; SSE2: # BB#0: 558; SSE2-NEXT: xorps %xmm1, %xmm0 559; SSE2-NEXT: xorps %xmm1, %xmm1 560; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 561; SSE2-NEXT: retq 562; 563; SSSE3-LABEL: combine_bitwise_ops_test3c: 564; SSSE3: # BB#0: 565; SSSE3-NEXT: xorps %xmm1, %xmm0 566; SSSE3-NEXT: xorps %xmm1, %xmm1 567; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 568; SSSE3-NEXT: retq 569; 570; SSE41-LABEL: combine_bitwise_ops_test3c: 571; SSE41: # BB#0: 572; SSE41-NEXT: xorps %xmm1, %xmm0 573; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero 574; SSE41-NEXT: retq 575; 576; AVX-LABEL: combine_bitwise_ops_test3c: 577; AVX: # 
BB#0: 578; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 579; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero 580; AVX-NEXT: retq 581 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 582 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 583 %xor = xor <4 x i32> %shuf1, %shuf2 584 ret <4 x i32> %xor 585} 586 587define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 588; SSE-LABEL: combine_bitwise_ops_test4c: 589; SSE: # BB#0: 590; SSE-NEXT: andps %xmm1, %xmm0 591; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] 592; SSE-NEXT: movaps %xmm2, %xmm0 593; SSE-NEXT: retq 594; 595; AVX-LABEL: combine_bitwise_ops_test4c: 596; AVX: # BB#0: 597; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 598; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] 599; AVX-NEXT: retq 600 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 601 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 602 %and = and <4 x i32> %shuf1, %shuf2 603 ret <4 x i32> %and 604} 605 606define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 607; SSE-LABEL: combine_bitwise_ops_test5c: 608; SSE: # BB#0: 609; SSE-NEXT: orps %xmm1, %xmm0 610; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] 611; SSE-NEXT: movaps %xmm2, %xmm0 612; SSE-NEXT: retq 613; 614; AVX-LABEL: combine_bitwise_ops_test5c: 615; AVX: # BB#0: 616; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 617; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] 618; AVX-NEXT: retq 619 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 620 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 621 %or = or <4 x i32> %shuf1, %shuf2 622 ret <4 x i32> %or 623} 624 625define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 626; SSE2-LABEL: 
combine_bitwise_ops_test6c: 627; SSE2: # BB#0: 628; SSE2-NEXT: xorps %xmm1, %xmm0 629; SSE2-NEXT: xorps %xmm1, %xmm1 630; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] 631; SSE2-NEXT: movaps %xmm1, %xmm0 632; SSE2-NEXT: retq 633; 634; SSSE3-LABEL: combine_bitwise_ops_test6c: 635; SSSE3: # BB#0: 636; SSSE3-NEXT: xorps %xmm1, %xmm0 637; SSSE3-NEXT: xorps %xmm1, %xmm1 638; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] 639; SSSE3-NEXT: movaps %xmm1, %xmm0 640; SSSE3-NEXT: retq 641; 642; SSE41-LABEL: combine_bitwise_ops_test6c: 643; SSE41: # BB#0: 644; SSE41-NEXT: xorps %xmm1, %xmm0 645; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3] 646; SSE41-NEXT: retq 647; 648; AVX-LABEL: combine_bitwise_ops_test6c: 649; AVX: # BB#0: 650; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 651; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3] 652; AVX-NEXT: retq 653 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 654 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 655 %xor = xor <4 x i32> %shuf1, %shuf2 656 ret <4 x i32> %xor 657} 658 659define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { 660; SSE-LABEL: combine_nested_undef_test1: 661; SSE: # BB#0: 662; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 663; SSE-NEXT: retq 664; 665; AVX-LABEL: combine_nested_undef_test1: 666; AVX: # BB#0: 667; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 668; AVX-NEXT: retq 669 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 670 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 671 ret <4 x i32> %2 672} 673 674define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { 675; SSE-LABEL: combine_nested_undef_test2: 676; SSE: # BB#0: 677; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 678; SSE-NEXT: retq 679; 680; AVX-LABEL: combine_nested_undef_test2: 681; AVX: # BB#0: 682; AVX-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 683; AVX-NEXT: retq 684 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 685 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 686 ret <4 x i32> %2 687} 688 689define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { 690; SSE-LABEL: combine_nested_undef_test3: 691; SSE: # BB#0: 692; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 693; SSE-NEXT: retq 694; 695; AVX-LABEL: combine_nested_undef_test3: 696; AVX: # BB#0: 697; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 698; AVX-NEXT: retq 699 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> 700 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 701 ret <4 x i32> %2 702} 703 704define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { 705; SSE-LABEL: combine_nested_undef_test4: 706; SSE: # BB#0: 707; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 708; SSE-NEXT: retq 709; 710; AVX1-LABEL: combine_nested_undef_test4: 711; AVX1: # BB#0: 712; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 713; AVX1-NEXT: retq 714; 715; AVX2-LABEL: combine_nested_undef_test4: 716; AVX2: # BB#0: 717; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 718; AVX2-NEXT: retq 719 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> 720 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> 721 ret <4 x i32> %2 722} 723 724define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { 725; SSE-LABEL: combine_nested_undef_test5: 726; SSE: # BB#0: 727; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 728; SSE-NEXT: retq 729; 730; AVX-LABEL: combine_nested_undef_test5: 731; AVX: # BB#0: 732; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 733; AVX-NEXT: retq 734 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> 735 %2 = 
shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> 736 ret <4 x i32> %2 737} 738 739define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) { 740; SSE-LABEL: combine_nested_undef_test6: 741; SSE: # BB#0: 742; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 743; SSE-NEXT: retq 744; 745; AVX-LABEL: combine_nested_undef_test6: 746; AVX: # BB#0: 747; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 748; AVX-NEXT: retq 749 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 750 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> 751 ret <4 x i32> %2 752} 753 754define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) { 755; SSE-LABEL: combine_nested_undef_test7: 756; SSE: # BB#0: 757; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] 758; SSE-NEXT: retq 759; 760; AVX-LABEL: combine_nested_undef_test7: 761; AVX: # BB#0: 762; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] 763; AVX-NEXT: retq 764 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 765 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 766 ret <4 x i32> %2 767} 768 769define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { 770; SSE-LABEL: combine_nested_undef_test8: 771; SSE: # BB#0: 772; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 773; SSE-NEXT: retq 774; 775; AVX-LABEL: combine_nested_undef_test8: 776; AVX: # BB#0: 777; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 778; AVX-NEXT: retq 779 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 780 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> 781 ret <4 x i32> %2 782} 783 784define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 x i32> %B) { 785; SSE-LABEL: combine_nested_undef_test9: 786; SSE: # BB#0: 787; SSE-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[1,3,2,2] 788; SSE-NEXT: retq 789; 790; AVX-LABEL: combine_nested_undef_test9: 791; AVX: # BB#0: 792; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] 793; AVX-NEXT: retq 794 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> 795 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> 796 ret <4 x i32> %2 797} 798 799define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) { 800; SSE-LABEL: combine_nested_undef_test10: 801; SSE: # BB#0: 802; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] 803; SSE-NEXT: retq 804; 805; AVX-LABEL: combine_nested_undef_test10: 806; AVX: # BB#0: 807; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,3] 808; AVX-NEXT: retq 809 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5> 810 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4> 811 ret <4 x i32> %2 812} 813 814define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) { 815; SSE-LABEL: combine_nested_undef_test11: 816; SSE: # BB#0: 817; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] 818; SSE-NEXT: retq 819; 820; AVX-LABEL: combine_nested_undef_test11: 821; AVX: # BB#0: 822; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] 823; AVX-NEXT: retq 824 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4> 825 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0> 826 ret <4 x i32> %2 827} 828 829define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) { 830; SSE-LABEL: combine_nested_undef_test12: 831; SSE: # BB#0: 832; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 833; SSE-NEXT: retq 834; 835; AVX1-LABEL: combine_nested_undef_test12: 836; AVX1: # BB#0: 837; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 838; AVX1-NEXT: retq 839; 840; AVX2-LABEL: combine_nested_undef_test12: 841; AVX2: # BB#0: 842; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 843; 
AVX2-NEXT: retq 844 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> 845 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> 846 ret <4 x i32> %2 847} 848 849; The following pair of shuffles is folded into vector %A. 850define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) { 851; ALL-LABEL: combine_nested_undef_test13: 852; ALL: # BB#0: 853; ALL-NEXT: retq 854 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6> 855 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4> 856 ret <4 x i32> %2 857} 858 859; The following pair of shuffles is folded into vector %B. 860define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) { 861; SSE-LABEL: combine_nested_undef_test14: 862; SSE: # BB#0: 863; SSE-NEXT: movaps %xmm1, %xmm0 864; SSE-NEXT: retq 865; 866; AVX-LABEL: combine_nested_undef_test14: 867; AVX: # BB#0: 868; AVX-NEXT: vmovaps %xmm1, %xmm0 869; AVX-NEXT: retq 870 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 871 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4> 872 ret <4 x i32> %2 873} 874 875 876; Verify that we don't optimize the following cases. We expect more than one shuffle. 877; 878; FIXME: Many of these already don't make sense, and the rest should stop 879; making sense with the new vector shuffle lowering. Revisit at least testing for 880; it. 
881 882define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) { 883; SSE2-LABEL: combine_nested_undef_test15: 884; SSE2: # BB#0: 885; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 886; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 887; SSE2-NEXT: movaps %xmm1, %xmm0 888; SSE2-NEXT: retq 889; 890; SSSE3-LABEL: combine_nested_undef_test15: 891; SSSE3: # BB#0: 892; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] 893; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1] 894; SSSE3-NEXT: movaps %xmm1, %xmm0 895; SSSE3-NEXT: retq 896; 897; SSE41-LABEL: combine_nested_undef_test15: 898; SSE41: # BB#0: 899; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 900; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 901; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 902; SSE41-NEXT: retq 903; 904; AVX1-LABEL: combine_nested_undef_test15: 905; AVX1: # BB#0: 906; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] 907; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 908; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 909; AVX1-NEXT: retq 910; 911; AVX2-LABEL: combine_nested_undef_test15: 912; AVX2: # BB#0: 913; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 914; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 915; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 916; AVX2-NEXT: retq 917 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 918 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 919 ret <4 x i32> %2 920} 921 922define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) { 923; SSE2-LABEL: combine_nested_undef_test16: 924; SSE2: # BB#0: 925; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 926; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 927; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 928; SSE2-NEXT: retq 929; 930; SSSE3-LABEL: combine_nested_undef_test16: 931; 
SSSE3: # BB#0: 932; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] 933; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3] 934; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 935; SSSE3-NEXT: retq 936; 937; SSE41-LABEL: combine_nested_undef_test16: 938; SSE41: # BB#0: 939; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 940; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 941; SSE41-NEXT: retq 942; 943; AVX1-LABEL: combine_nested_undef_test16: 944; AVX1: # BB#0: 945; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 946; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 947; AVX1-NEXT: retq 948; 949; AVX2-LABEL: combine_nested_undef_test16: 950; AVX2: # BB#0: 951; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 952; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 953; AVX2-NEXT: retq 954 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 955 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 956 ret <4 x i32> %2 957} 958 959define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) { 960; SSE2-LABEL: combine_nested_undef_test17: 961; SSE2: # BB#0: 962; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 963; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 964; SSE2-NEXT: retq 965; 966; SSSE3-LABEL: combine_nested_undef_test17: 967; SSSE3: # BB#0: 968; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0] 969; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2] 970; SSSE3-NEXT: retq 971; 972; SSE41-LABEL: combine_nested_undef_test17: 973; SSE41: # BB#0: 974; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 975; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 976; SSE41-NEXT: retq 977; 978; AVX1-LABEL: combine_nested_undef_test17: 979; AVX1: # BB#0: 980; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 981; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[3,1,0,1] 982; AVX1-NEXT: retq 983; 984; AVX2-LABEL: combine_nested_undef_test17: 985; AVX2: # BB#0: 986; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 987; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 988; AVX2-NEXT: retq 989 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 990 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 991 ret <4 x i32> %2 992} 993 994define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) { 995; SSE-LABEL: combine_nested_undef_test18: 996; SSE: # BB#0: 997; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] 998; SSE-NEXT: retq 999; 1000; AVX-LABEL: combine_nested_undef_test18: 1001; AVX: # BB#0: 1002; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,0,3] 1003; AVX-NEXT: retq 1004 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1005 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3> 1006 ret <4 x i32> %2 1007} 1008 1009define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) { 1010; SSE2-LABEL: combine_nested_undef_test19: 1011; SSE2: # BB#0: 1012; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1013; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 1014; SSE2-NEXT: retq 1015; 1016; SSSE3-LABEL: combine_nested_undef_test19: 1017; SSSE3: # BB#0: 1018; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1019; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0] 1020; SSSE3-NEXT: retq 1021; 1022; SSE41-LABEL: combine_nested_undef_test19: 1023; SSE41: # BB#0: 1024; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1025; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 1026; SSE41-NEXT: retq 1027; 1028; AVX1-LABEL: combine_nested_undef_test19: 1029; AVX1: # BB#0: 1030; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1031; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 1032; 
AVX1-NEXT: retq 1033; 1034; AVX2-LABEL: combine_nested_undef_test19: 1035; AVX2: # BB#0: 1036; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 1037; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,0,0] 1038; AVX2-NEXT: retq 1039 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6> 1040 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0> 1041 ret <4 x i32> %2 1042} 1043 1044define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) { 1045; SSE2-LABEL: combine_nested_undef_test20: 1046; SSE2: # BB#0: 1047; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1048; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1049; SSE2-NEXT: movaps %xmm1, %xmm0 1050; SSE2-NEXT: retq 1051; 1052; SSSE3-LABEL: combine_nested_undef_test20: 1053; SSSE3: # BB#0: 1054; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 1055; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1] 1056; SSSE3-NEXT: movaps %xmm1, %xmm0 1057; SSSE3-NEXT: retq 1058; 1059; SSE41-LABEL: combine_nested_undef_test20: 1060; SSE41: # BB#0: 1061; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1062; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1063; SSE41-NEXT: retq 1064; 1065; AVX1-LABEL: combine_nested_undef_test20: 1066; AVX1: # BB#0: 1067; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 1068; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1069; AVX1-NEXT: retq 1070; 1071; AVX2-LABEL: combine_nested_undef_test20: 1072; AVX2: # BB#0: 1073; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] 1074; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,3,0] 1075; AVX2-NEXT: retq 1076 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4> 1077 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3> 1078 ret <4 x i32> %2 1079} 1080 1081define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) { 1082; SSE2-LABEL: 
combine_nested_undef_test21: 1083; SSE2: # BB#0: 1084; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1085; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1086; SSE2-NEXT: retq 1087; 1088; SSSE3-LABEL: combine_nested_undef_test21: 1089; SSSE3: # BB#0: 1090; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1091; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3] 1092; SSSE3-NEXT: retq 1093; 1094; SSE41-LABEL: combine_nested_undef_test21: 1095; SSE41: # BB#0: 1096; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1097; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1098; SSE41-NEXT: retq 1099; 1100; AVX1-LABEL: combine_nested_undef_test21: 1101; AVX1: # BB#0: 1102; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1103; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1104; AVX1-NEXT: retq 1105; 1106; AVX2-LABEL: combine_nested_undef_test21: 1107; AVX2: # BB#0: 1108; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1109; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1110; AVX2-NEXT: retq 1111 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1> 1112 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1113 ret <4 x i32> %2 1114} 1115 1116 1117; Test that we correctly combine shuffles according to rule 1118; shuffle(shuffle(x, y), undef) -> shuffle(y, undef) 1119 1120define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) { 1121; SSE-LABEL: combine_nested_undef_test22: 1122; SSE: # BB#0: 1123; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1124; SSE-NEXT: retq 1125; 1126; AVX-LABEL: combine_nested_undef_test22: 1127; AVX: # BB#0: 1128; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,1,3] 1129; AVX-NEXT: retq 1130 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1131 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3> 1132 ret 
<4 x i32> %2 1133} 1134 1135define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) { 1136; SSE-LABEL: combine_nested_undef_test23: 1137; SSE: # BB#0: 1138; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1139; SSE-NEXT: retq 1140; 1141; AVX-LABEL: combine_nested_undef_test23: 1142; AVX: # BB#0: 1143; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] 1144; AVX-NEXT: retq 1145 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7> 1146 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3> 1147 ret <4 x i32> %2 1148} 1149 1150define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) { 1151; SSE-LABEL: combine_nested_undef_test24: 1152; SSE: # BB#0: 1153; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1154; SSE-NEXT: retq 1155; 1156; AVX-LABEL: combine_nested_undef_test24: 1157; AVX: # BB#0: 1158; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,3,2,3] 1159; AVX-NEXT: retq 1160 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1161 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4> 1162 ret <4 x i32> %2 1163} 1164 1165define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) { 1166; SSE-LABEL: combine_nested_undef_test25: 1167; SSE: # BB#0: 1168; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1169; SSE-NEXT: retq 1170; 1171; AVX1-LABEL: combine_nested_undef_test25: 1172; AVX1: # BB#0: 1173; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1174; AVX1-NEXT: retq 1175; 1176; AVX2-LABEL: combine_nested_undef_test25: 1177; AVX2: # BB#0: 1178; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1179; AVX2-NEXT: retq 1180 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4> 1181 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1> 1182 ret <4 x i32> %2 1183} 1184 1185define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) { 
1186; SSE-LABEL: combine_nested_undef_test26: 1187; SSE: # BB#0: 1188; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1189; SSE-NEXT: retq 1190; 1191; AVX-LABEL: combine_nested_undef_test26: 1192; AVX: # BB#0: 1193; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 1194; AVX-NEXT: retq 1195 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7> 1196 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3> 1197 ret <4 x i32> %2 1198} 1199 1200define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) { 1201; SSE-LABEL: combine_nested_undef_test27: 1202; SSE: # BB#0: 1203; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1204; SSE-NEXT: retq 1205; 1206; AVX1-LABEL: combine_nested_undef_test27: 1207; AVX1: # BB#0: 1208; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 1209; AVX1-NEXT: retq 1210; 1211; AVX2-LABEL: combine_nested_undef_test27: 1212; AVX2: # BB#0: 1213; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 1214; AVX2-NEXT: retq 1215 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4> 1216 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2> 1217 ret <4 x i32> %2 1218} 1219 1220define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) { 1221; SSE-LABEL: combine_nested_undef_test28: 1222; SSE: # BB#0: 1223; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1224; SSE-NEXT: retq 1225; 1226; AVX-LABEL: combine_nested_undef_test28: 1227; AVX: # BB#0: 1228; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] 1229; AVX-NEXT: retq 1230 %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 1231 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2> 1232 ret <4 x i32> %2 1233} 1234 1235define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) { 1236; SSE-LABEL: combine_test1: 1237; SSE: # BB#0: 1238; SSE-NEXT: movaps %xmm1, %xmm0 1239; SSE-NEXT: retq 1240; 
1241; AVX-LABEL: combine_test1: 1242; AVX: # BB#0: 1243; AVX-NEXT: vmovaps %xmm1, %xmm0 1244; AVX-NEXT: retq 1245 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1246 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1247 ret <4 x float> %2 1248} 1249 1250define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) { 1251; SSE2-LABEL: combine_test2: 1252; SSE2: # BB#0: 1253; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1254; SSE2-NEXT: movaps %xmm1, %xmm0 1255; SSE2-NEXT: retq 1256; 1257; SSSE3-LABEL: combine_test2: 1258; SSSE3: # BB#0: 1259; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1260; SSSE3-NEXT: movaps %xmm1, %xmm0 1261; SSSE3-NEXT: retq 1262; 1263; SSE41-LABEL: combine_test2: 1264; SSE41: # BB#0: 1265; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1266; SSE41-NEXT: retq 1267; 1268; AVX-LABEL: combine_test2: 1269; AVX: # BB#0: 1270; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1271; AVX-NEXT: retq 1272 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1273 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1274 ret <4 x float> %2 1275} 1276 1277define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) { 1278; SSE-LABEL: combine_test3: 1279; SSE: # BB#0: 1280; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1281; SSE-NEXT: retq 1282; 1283; AVX-LABEL: combine_test3: 1284; AVX: # BB#0: 1285; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1286; AVX-NEXT: retq 1287 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1288 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1289 ret <4 x float> %2 1290} 1291 1292define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) { 1293; SSE-LABEL: combine_test4: 1294; SSE: # BB#0: 1295; SSE-NEXT: movhlps {{.*#+}} xmm0 = 
xmm1[1],xmm0[1] 1296; SSE-NEXT: retq 1297; 1298; AVX-LABEL: combine_test4: 1299; AVX: # BB#0: 1300; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1301; AVX-NEXT: retq 1302 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1303 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1304 ret <4 x float> %2 1305} 1306 1307define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) { 1308; SSE2-LABEL: combine_test5: 1309; SSE2: # BB#0: 1310; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1311; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1312; SSE2-NEXT: retq 1313; 1314; SSSE3-LABEL: combine_test5: 1315; SSSE3: # BB#0: 1316; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1317; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1318; SSSE3-NEXT: retq 1319; 1320; SSE41-LABEL: combine_test5: 1321; SSE41: # BB#0: 1322; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1323; SSE41-NEXT: retq 1324; 1325; AVX-LABEL: combine_test5: 1326; AVX: # BB#0: 1327; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1328; AVX-NEXT: retq 1329 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1330 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1331 ret <4 x float> %2 1332} 1333 1334define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) { 1335; SSE-LABEL: combine_test6: 1336; SSE: # BB#0: 1337; SSE-NEXT: movaps %xmm1, %xmm0 1338; SSE-NEXT: retq 1339; 1340; AVX-LABEL: combine_test6: 1341; AVX: # BB#0: 1342; AVX-NEXT: vmovaps %xmm1, %xmm0 1343; AVX-NEXT: retq 1344 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1345 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1346 ret <4 x i32> %2 1347} 1348 1349define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) { 1350; SSE2-LABEL: combine_test7: 
1351; SSE2: # BB#0: 1352; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1353; SSE2-NEXT: movaps %xmm1, %xmm0 1354; SSE2-NEXT: retq 1355; 1356; SSSE3-LABEL: combine_test7: 1357; SSSE3: # BB#0: 1358; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1359; SSSE3-NEXT: movaps %xmm1, %xmm0 1360; SSSE3-NEXT: retq 1361; 1362; SSE41-LABEL: combine_test7: 1363; SSE41: # BB#0: 1364; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1365; SSE41-NEXT: retq 1366; 1367; AVX1-LABEL: combine_test7: 1368; AVX1: # BB#0: 1369; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1370; AVX1-NEXT: retq 1371; 1372; AVX2-LABEL: combine_test7: 1373; AVX2: # BB#0: 1374; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1375; AVX2-NEXT: retq 1376 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1377 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1378 ret <4 x i32> %2 1379} 1380 1381define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) { 1382; SSE-LABEL: combine_test8: 1383; SSE: # BB#0: 1384; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1385; SSE-NEXT: retq 1386; 1387; AVX-LABEL: combine_test8: 1388; AVX: # BB#0: 1389; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1390; AVX-NEXT: retq 1391 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 1392 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1393 ret <4 x i32> %2 1394} 1395 1396define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) { 1397; SSE-LABEL: combine_test9: 1398; SSE: # BB#0: 1399; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 1400; SSE-NEXT: movdqa %xmm1, %xmm0 1401; SSE-NEXT: retq 1402; 1403; AVX-LABEL: combine_test9: 1404; AVX: # BB#0: 1405; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1406; AVX-NEXT: retq 1407 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1408 
%2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1409 ret <4 x i32> %2 1410} 1411 1412define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) { 1413; SSE2-LABEL: combine_test10: 1414; SSE2: # BB#0: 1415; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1416; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1417; SSE2-NEXT: retq 1418; 1419; SSSE3-LABEL: combine_test10: 1420; SSSE3: # BB#0: 1421; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1422; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1423; SSSE3-NEXT: retq 1424; 1425; SSE41-LABEL: combine_test10: 1426; SSE41: # BB#0: 1427; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1428; SSE41-NEXT: retq 1429; 1430; AVX1-LABEL: combine_test10: 1431; AVX1: # BB#0: 1432; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1433; AVX1-NEXT: retq 1434; 1435; AVX2-LABEL: combine_test10: 1436; AVX2: # BB#0: 1437; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1438; AVX2-NEXT: retq 1439 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1440 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1441 ret <4 x i32> %2 1442} 1443 1444define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) { 1445; ALL-LABEL: combine_test11: 1446; ALL: # BB#0: 1447; ALL-NEXT: retq 1448 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1449 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1450 ret <4 x float> %2 1451} 1452 1453define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) { 1454; SSE2-LABEL: combine_test12: 1455; SSE2: # BB#0: 1456; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1457; SSE2-NEXT: movaps %xmm1, %xmm0 1458; SSE2-NEXT: retq 1459; 1460; SSSE3-LABEL: combine_test12: 1461; SSSE3: # BB#0: 1462; SSSE3-NEXT: movss {{.*#+}} xmm1 = 
xmm0[0],xmm1[1,2,3] 1463; SSSE3-NEXT: movaps %xmm1, %xmm0 1464; SSSE3-NEXT: retq 1465; 1466; SSE41-LABEL: combine_test12: 1467; SSE41: # BB#0: 1468; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1469; SSE41-NEXT: retq 1470; 1471; AVX-LABEL: combine_test12: 1472; AVX: # BB#0: 1473; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1474; AVX-NEXT: retq 1475 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1476 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1477 ret <4 x float> %2 1478} 1479 1480define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) { 1481; SSE-LABEL: combine_test13: 1482; SSE: # BB#0: 1483; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1484; SSE-NEXT: retq 1485; 1486; AVX-LABEL: combine_test13: 1487; AVX: # BB#0: 1488; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1489; AVX-NEXT: retq 1490 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1491 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1492 ret <4 x float> %2 1493} 1494 1495define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) { 1496; SSE-LABEL: combine_test14: 1497; SSE: # BB#0: 1498; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1499; SSE-NEXT: retq 1500; 1501; AVX-LABEL: combine_test14: 1502; AVX: # BB#0: 1503; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1504; AVX-NEXT: retq 1505 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1506 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1507 ret <4 x float> %2 1508} 1509 1510define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) { 1511; SSE2-LABEL: combine_test15: 1512; SSE2: # BB#0: 1513; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1514; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1515; SSE2-NEXT: retq 1516; 
1517; SSSE3-LABEL: combine_test15: 1518; SSSE3: # BB#0: 1519; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1520; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1521; SSSE3-NEXT: retq 1522; 1523; SSE41-LABEL: combine_test15: 1524; SSE41: # BB#0: 1525; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1526; SSE41-NEXT: retq 1527; 1528; AVX-LABEL: combine_test15: 1529; AVX: # BB#0: 1530; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1531; AVX-NEXT: retq 1532 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1533 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1534 ret <4 x float> %2 1535} 1536 1537define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) { 1538; ALL-LABEL: combine_test16: 1539; ALL: # BB#0: 1540; ALL-NEXT: retq 1541 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1542 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1543 ret <4 x i32> %2 1544} 1545 1546define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) { 1547; SSE2-LABEL: combine_test17: 1548; SSE2: # BB#0: 1549; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1550; SSE2-NEXT: movaps %xmm1, %xmm0 1551; SSE2-NEXT: retq 1552; 1553; SSSE3-LABEL: combine_test17: 1554; SSSE3: # BB#0: 1555; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 1556; SSSE3-NEXT: movaps %xmm1, %xmm0 1557; SSSE3-NEXT: retq 1558; 1559; SSE41-LABEL: combine_test17: 1560; SSE41: # BB#0: 1561; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1562; SSE41-NEXT: retq 1563; 1564; AVX1-LABEL: combine_test17: 1565; AVX1: # BB#0: 1566; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1567; AVX1-NEXT: retq 1568; 1569; AVX2-LABEL: combine_test17: 1570; AVX2: # BB#0: 1571; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1572; AVX2-NEXT: retq 1573 %1 = shufflevector <4 x i32> %a, <4 x 
i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 1574 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 1575 ret <4 x i32> %2 1576} 1577 1578define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) { 1579; SSE-LABEL: combine_test18: 1580; SSE: # BB#0: 1581; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1582; SSE-NEXT: retq 1583; 1584; AVX-LABEL: combine_test18: 1585; AVX: # BB#0: 1586; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1587; AVX-NEXT: retq 1588 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1589 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3> 1590 ret <4 x i32> %2 1591} 1592 1593define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) { 1594; SSE-LABEL: combine_test19: 1595; SSE: # BB#0: 1596; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1597; SSE-NEXT: retq 1598; 1599; AVX-LABEL: combine_test19: 1600; AVX: # BB#0: 1601; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1602; AVX-NEXT: retq 1603 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5> 1604 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1605 ret <4 x i32> %2 1606} 1607 1608define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) { 1609; SSE2-LABEL: combine_test20: 1610; SSE2: # BB#0: 1611; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1612; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1613; SSE2-NEXT: retq 1614; 1615; SSSE3-LABEL: combine_test20: 1616; SSSE3: # BB#0: 1617; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1618; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1619; SSSE3-NEXT: retq 1620; 1621; SSE41-LABEL: combine_test20: 1622; SSE41: # BB#0: 1623; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1624; SSE41-NEXT: retq 1625; 1626; AVX1-LABEL: combine_test20: 1627; AVX1: # BB#0: 1628; AVX1-NEXT: vpblendw 
{{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1629; AVX1-NEXT: retq 1630; 1631; AVX2-LABEL: combine_test20: 1632; AVX2: # BB#0: 1633; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1634; AVX2-NEXT: retq 1635 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7> 1636 %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 1637 ret <4 x i32> %2 1638} 1639 1640define <4 x i32> @combine_test21(<8 x i32> %a, <4 x i32>* %ptr) { 1641; SSE-LABEL: combine_test21: 1642; SSE: # BB#0: 1643; SSE-NEXT: movdqa %xmm0, %xmm2 1644; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 1645; SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1646; SSE-NEXT: movdqa %xmm2, (%rdi) 1647; SSE-NEXT: retq 1648; 1649; AVX1-LABEL: combine_test21: 1650; AVX1: # BB#0: 1651; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1652; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] 1653; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1654; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 1655; AVX1-NEXT: vzeroupper 1656; AVX1-NEXT: retq 1657; 1658; AVX2-LABEL: combine_test21: 1659; AVX2: # BB#0: 1660; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1661; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm0[0],xmm1[0] 1662; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1663; AVX2-NEXT: vmovdqa %xmm2, (%rdi) 1664; AVX2-NEXT: vzeroupper 1665; AVX2-NEXT: retq 1666 %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5> 1667 %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7> 1668 store <4 x i32> %1, <4 x i32>* %ptr, align 16 1669 ret <4 x i32> %2 1670} 1671 1672define <8 x float> @combine_test22(<2 x float>* %a, <2 x float>* %b) { 1673; SSE-LABEL: combine_test22: 1674; SSE: # BB#0: 1675; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 1676; SSE-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0] 1677; SSE-NEXT: retq 1678; 1679; AVX-LABEL: combine_test22: 1680; AVX: # BB#0: 1681; AVX-NEXT: 
vmovsd {{.*#+}} xmm0 = mem[0],zero 1682; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0] 1683; AVX-NEXT: retq 1684; Current AVX2 lowering of this is still awful, not adding a test case. 1685 %1 = load <2 x float>, <2 x float>* %a, align 8 1686 %2 = load <2 x float>, <2 x float>* %b, align 8 1687 %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> 1688 ret <8 x float> %3 1689} 1690 1691; PR22359 1692define void @combine_test23(<8 x float> %v, <2 x float>* %ptr) { 1693; SSE-LABEL: combine_test23: 1694; SSE: # BB#0: 1695; SSE-NEXT: movups %xmm0, (%rdi) 1696; SSE-NEXT: retq 1697; 1698; AVX-LABEL: combine_test23: 1699; AVX: # BB#0: 1700; AVX-NEXT: vmovups %xmm0, (%rdi) 1701; AVX-NEXT: vzeroupper 1702; AVX-NEXT: retq 1703 %idx2 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 1 1704 %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1> 1705 %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3> 1706 store <2 x float> %shuffle0, <2 x float>* %ptr, align 8 1707 store <2 x float> %shuffle1, <2 x float>* %idx2, align 8 1708 ret void 1709} 1710 1711; Check some negative cases. 1712; FIXME: Do any of these really make sense? Are they redundant with the above tests? 
1713 1714define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) { 1715; SSE-LABEL: combine_test1b: 1716; SSE: # BB#0: 1717; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,0] 1718; SSE-NEXT: movaps %xmm1, %xmm0 1719; SSE-NEXT: retq 1720; 1721; AVX-LABEL: combine_test1b: 1722; AVX: # BB#0: 1723; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] 1724; AVX-NEXT: retq 1725 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1726 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0> 1727 ret <4 x float> %2 1728} 1729 1730define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) { 1731; SSE2-LABEL: combine_test2b: 1732; SSE2: # BB#0: 1733; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0,0] 1734; SSE2-NEXT: movaps %xmm1, %xmm0 1735; SSE2-NEXT: retq 1736; 1737; SSSE3-LABEL: combine_test2b: 1738; SSSE3: # BB#0: 1739; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1740; SSSE3-NEXT: retq 1741; 1742; SSE41-LABEL: combine_test2b: 1743; SSE41: # BB#0: 1744; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] 1745; SSE41-NEXT: retq 1746; 1747; AVX-LABEL: combine_test2b: 1748; AVX: # BB#0: 1749; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm1[0,0] 1750; AVX-NEXT: retq 1751 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1752 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5> 1753 ret <4 x float> %2 1754} 1755 1756define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) { 1757; SSE2-LABEL: combine_test3b: 1758; SSE2: # BB#0: 1759; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1760; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1761; SSE2-NEXT: retq 1762; 1763; SSSE3-LABEL: combine_test3b: 1764; SSSE3: # BB#0: 1765; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] 1766; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 1767; SSSE3-NEXT: retq 1768; 1769; SSE41-LABEL: combine_test3b: 1770; SSE41: # 
BB#0: 1771; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 1772; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1773; SSE41-NEXT: retq 1774; 1775; AVX-LABEL: combine_test3b: 1776; AVX: # BB#0: 1777; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 1778; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] 1779; AVX-NEXT: retq 1780 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3> 1781 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7> 1782 ret <4 x float> %2 1783} 1784 1785define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) { 1786; SSE-LABEL: combine_test4b: 1787; SSE: # BB#0: 1788; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] 1789; SSE-NEXT: movaps %xmm1, %xmm0 1790; SSE-NEXT: retq 1791; 1792; AVX-LABEL: combine_test4b: 1793; AVX: # BB#0: 1794; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] 1795; AVX-NEXT: retq 1796 %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1797 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7> 1798 ret <4 x float> %2 1799} 1800 1801 1802; Verify that we correctly fold shuffles even when we use illegal vector types. 
1803 1804define <4 x i8> @combine_test1c(<4 x i8>* %a, <4 x i8>* %b) { 1805; SSE2-LABEL: combine_test1c: 1806; SSE2: # BB#0: 1807; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1808; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1809; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1810; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1811; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1812; SSE2-NEXT: retq 1813; 1814; SSSE3-LABEL: combine_test1c: 1815; SSSE3: # BB#0: 1816; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1817; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1818; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1819; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1820; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 1821; SSSE3-NEXT: retq 1822; 1823; SSE41-LABEL: combine_test1c: 1824; SSE41: # BB#0: 1825; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1826; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1827; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] 1828; SSE41-NEXT: retq 1829; 1830; AVX1-LABEL: combine_test1c: 1831; AVX1: # BB#0: 1832; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1833; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1834; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] 1835; AVX1-NEXT: retq 1836; 1837; AVX2-LABEL: combine_test1c: 1838; AVX2: # BB#0: 1839; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1840; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1841; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 1842; AVX2-NEXT: retq 1843 %A = load <4 x i8>, <4 x i8>* %a 1844 %B = load <4 x i8>, <4 x i8>* %b 1845 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 1846 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 1847 ret <4 x i8> %2 1848} 1849 1850define <4 x i8> @combine_test2c(<4 x i8>* %a, <4 x i8>* %b) { 1851; SSE2-LABEL: combine_test2c: 1852; SSE2: # BB#0: 1853; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1854; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1855; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1856; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1857; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1858; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1859; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1860; SSE2-NEXT: retq 1861; 1862; SSSE3-LABEL: combine_test2c: 1863; SSSE3: # BB#0: 1864; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1865; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1866; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1867; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1868; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1869; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1870; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1871; SSSE3-NEXT: retq 1872; 1873; SSE41-LABEL: combine_test2c: 1874; SSE41: # BB#0: 1875; SSE41-NEXT: pmovzxbd {{.*#+}} 
xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1876; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1877; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1878; SSE41-NEXT: retq 1879; 1880; AVX-LABEL: combine_test2c: 1881; AVX: # BB#0: 1882; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1883; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1884; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1885; AVX-NEXT: retq 1886 %A = load <4 x i8>, <4 x i8>* %a 1887 %B = load <4 x i8>, <4 x i8>* %b 1888 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5> 1889 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 1890 ret <4 x i8> %2 1891} 1892 1893define <4 x i8> @combine_test3c(<4 x i8>* %a, <4 x i8>* %b) { 1894; SSE2-LABEL: combine_test3c: 1895; SSE2: # BB#0: 1896; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1897; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1898; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1899; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1900; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1901; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1902; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1903; SSE2-NEXT: retq 1904; 1905; SSSE3-LABEL: combine_test3c: 1906; SSSE3: # BB#0: 1907; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1908; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1909; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1910; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1911; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1912; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1913; SSSE3-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1914; SSSE3-NEXT: retq 1915; 1916; SSE41-LABEL: combine_test3c: 1917; SSE41: # BB#0: 1918; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1919; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1920; SSE41-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] 1921; SSE41-NEXT: retq 1922; 1923; AVX-LABEL: combine_test3c: 1924; AVX: # BB#0: 1925; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1926; AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1927; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 1928; AVX-NEXT: retq 1929 %A = load <4 x i8>, <4 x i8>* %a 1930 %B = load <4 x i8>, <4 x i8>* %b 1931 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 1932 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 1933 ret <4 x i8> %2 1934} 1935 1936define <4 x i8> @combine_test4c(<4 x i8>* %a, <4 x i8>* %b) { 1937; SSE2-LABEL: combine_test4c: 1938; SSE2: # BB#0: 1939; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1940; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1941; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1942; SSE2-NEXT: movd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero 1943; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1944; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1945; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1946; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1947; SSE2-NEXT: retq 1948; 1949; SSSE3-LABEL: combine_test4c: 1950; SSSE3: # BB#0: 1951; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1952; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1953; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1954; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1955; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 1956; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1957; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 1958; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 1959; SSSE3-NEXT: retq 1960; 1961; SSE41-LABEL: combine_test4c: 1962; SSE41: # BB#0: 1963; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1964; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1965; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] 1966; SSE41-NEXT: retq 1967; 1968; AVX1-LABEL: combine_test4c: 1969; AVX1: # BB#0: 1970; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1971; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1972; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1973; AVX1-NEXT: retq 1974; 1975; AVX2-LABEL: combine_test4c: 1976; AVX2: # BB#0: 1977; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1978; AVX2-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero 1979; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 1980; AVX2-NEXT: retq 1981 %A = load <4 x i8>, <4 x i8>* %a 1982 %B = load <4 x i8>, <4 x i8>* %b 1983 %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 1984 %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 1985 ret <4 x i8> %2 1986} 1987 1988 1989; The following test cases are generated from this C++ code 1990; 1991;__m128 blend_01(__m128 a, __m128 b) 1992;{ 1993; __m128 s = a; 1994; s = _mm_blend_ps( s, b, 1<<0 ); 1995; s = _mm_blend_ps( s, b, 1<<1 ); 1996; return s; 1997;} 1998; 1999;__m128 blend_02(__m128 a, __m128 b) 2000;{ 2001; __m128 s = a; 2002; s = _mm_blend_ps( s, b, 1<<0 ); 2003; s = _mm_blend_ps( s, b, 1<<2 ); 2004; return s; 2005;} 2006; 2007;__m128 blend_123(__m128 a, __m128 b) 2008;{ 2009; __m128 s = a; 2010; s = _mm_blend_ps( s, b, 1<<1 ); 2011; s = _mm_blend_ps( s, b, 1<<2 ); 2012; s = _mm_blend_ps( s, b, 1<<3 ); 2013; return s; 2014;} 2015 2016; Ideally, we should collapse the following shuffles into a single one. 
2017 2018define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) { 2019; SSE2-LABEL: combine_blend_01: 2020; SSE2: # BB#0: 2021; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2022; SSE2-NEXT: retq 2023; 2024; SSSE3-LABEL: combine_blend_01: 2025; SSSE3: # BB#0: 2026; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2027; SSSE3-NEXT: retq 2028; 2029; SSE41-LABEL: combine_blend_01: 2030; SSE41: # BB#0: 2031; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2032; SSE41-NEXT: retq 2033; 2034; AVX-LABEL: combine_blend_01: 2035; AVX: # BB#0: 2036; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2037; AVX-NEXT: retq 2038 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3> 2039 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 2040 ret <4 x float> %shuffle6 2041} 2042 2043define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) { 2044; SSE2-LABEL: combine_blend_02: 2045; SSE2: # BB#0: 2046; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 2047; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 2048; SSE2-NEXT: movaps %xmm1, %xmm0 2049; SSE2-NEXT: retq 2050; 2051; SSSE3-LABEL: combine_blend_02: 2052; SSSE3: # BB#0: 2053; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3] 2054; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3] 2055; SSSE3-NEXT: movaps %xmm1, %xmm0 2056; SSSE3-NEXT: retq 2057; 2058; SSE41-LABEL: combine_blend_02: 2059; SSE41: # BB#0: 2060; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2061; SSE41-NEXT: retq 2062; 2063; AVX-LABEL: combine_blend_02: 2064; AVX: # BB#0: 2065; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 2066; AVX-NEXT: retq 2067 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3> 2068 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3> 2069 ret <4 x float> %shuffle6 
2070} 2071 2072define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) { 2073; SSE2-LABEL: combine_blend_123: 2074; SSE2: # BB#0: 2075; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2076; SSE2-NEXT: movaps %xmm1, %xmm0 2077; SSE2-NEXT: retq 2078; 2079; SSSE3-LABEL: combine_blend_123: 2080; SSSE3: # BB#0: 2081; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2082; SSSE3-NEXT: movaps %xmm1, %xmm0 2083; SSSE3-NEXT: retq 2084; 2085; SSE41-LABEL: combine_blend_123: 2086; SSE41: # BB#0: 2087; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 2088; SSE41-NEXT: retq 2089; 2090; AVX-LABEL: combine_blend_123: 2091; AVX: # BB#0: 2092; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] 2093; AVX-NEXT: retq 2094 %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 2095 %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef> 2096 %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7> 2097 ret <4 x float> %shuffle12 2098} 2099 2100define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) { 2101; SSE-LABEL: combine_test_movhl_1: 2102; SSE: # BB#0: 2103; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2104; SSE-NEXT: movdqa %xmm1, %xmm0 2105; SSE-NEXT: retq 2106; 2107; AVX-LABEL: combine_test_movhl_1: 2108; AVX: # BB#0: 2109; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2110; AVX-NEXT: retq 2111 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3> 2112 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3> 2113 ret <4 x i32> %2 2114} 2115 2116define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) { 2117; SSE-LABEL: combine_test_movhl_2: 2118; SSE: # BB#0: 2119; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2120; SSE-NEXT: movdqa %xmm1, %xmm0 2121; SSE-NEXT: retq 2122; 2123; AVX-LABEL: 
combine_test_movhl_2: 2124; AVX: # BB#0: 2125; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2126; AVX-NEXT: retq 2127 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6> 2128 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2> 2129 ret <4 x i32> %2 2130} 2131 2132define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) { 2133; SSE-LABEL: combine_test_movhl_3: 2134; SSE: # BB#0: 2135; SSE-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm0[1] 2136; SSE-NEXT: movdqa %xmm1, %xmm0 2137; SSE-NEXT: retq 2138; 2139; AVX-LABEL: combine_test_movhl_3: 2140; AVX: # BB#0: 2141; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2142; AVX-NEXT: retq 2143 %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2> 2144 %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2> 2145 ret <4 x i32> %2 2146} 2147 2148 2149; Verify that we fold shuffles according to rule: 2150; (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2) 2151 2152define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) { 2153; SSE2-LABEL: combine_undef_input_test1: 2154; SSE2: # BB#0: 2155; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2156; SSE2-NEXT: retq 2157; 2158; SSSE3-LABEL: combine_undef_input_test1: 2159; SSSE3: # BB#0: 2160; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2161; SSSE3-NEXT: retq 2162; 2163; SSE41-LABEL: combine_undef_input_test1: 2164; SSE41: # BB#0: 2165; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2166; SSE41-NEXT: retq 2167; 2168; AVX-LABEL: combine_undef_input_test1: 2169; AVX: # BB#0: 2170; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2171; AVX-NEXT: retq 2172 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2173 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2174 ret <4 x float> %2 2175} 2176 2177define <4 x 
float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) { 2178; SSE-LABEL: combine_undef_input_test2: 2179; SSE: # BB#0: 2180; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2181; SSE-NEXT: retq 2182; 2183; AVX-LABEL: combine_undef_input_test2: 2184; AVX: # BB#0: 2185; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2186; AVX-NEXT: retq 2187 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2188 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2189 ret <4 x float> %2 2190} 2191 2192define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) { 2193; SSE-LABEL: combine_undef_input_test3: 2194; SSE: # BB#0: 2195; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2196; SSE-NEXT: retq 2197; 2198; AVX-LABEL: combine_undef_input_test3: 2199; AVX: # BB#0: 2200; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2201; AVX-NEXT: retq 2202 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2203 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2204 ret <4 x float> %2 2205} 2206 2207define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) { 2208; SSE-LABEL: combine_undef_input_test4: 2209; SSE: # BB#0: 2210; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2211; SSE-NEXT: retq 2212; 2213; AVX-LABEL: combine_undef_input_test4: 2214; AVX: # BB#0: 2215; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2216; AVX-NEXT: retq 2217 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2218 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2219 ret <4 x float> %2 2220} 2221 2222define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) { 2223; SSE2-LABEL: combine_undef_input_test5: 2224; SSE2: # BB#0: 2225; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2226; 
SSE2-NEXT: movapd %xmm1, %xmm0 2227; SSE2-NEXT: retq 2228; 2229; SSSE3-LABEL: combine_undef_input_test5: 2230; SSSE3: # BB#0: 2231; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2232; SSSE3-NEXT: movapd %xmm1, %xmm0 2233; SSSE3-NEXT: retq 2234; 2235; SSE41-LABEL: combine_undef_input_test5: 2236; SSE41: # BB#0: 2237; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 2238; SSE41-NEXT: retq 2239; 2240; AVX-LABEL: combine_undef_input_test5: 2241; AVX: # BB#0: 2242; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 2243; AVX-NEXT: retq 2244 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2245 %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2246 ret <4 x float> %2 2247} 2248 2249 2250; Verify that we fold shuffles according to rule: 2251; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2252 2253define <4 x float> @combine_undef_input_test6(<4 x float> %a) { 2254; ALL-LABEL: combine_undef_input_test6: 2255; ALL: # BB#0: 2256; ALL-NEXT: retq 2257 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2258 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2> 2259 ret <4 x float> %2 2260} 2261 2262define <4 x float> @combine_undef_input_test7(<4 x float> %a) { 2263; SSE2-LABEL: combine_undef_input_test7: 2264; SSE2: # BB#0: 2265; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2266; SSE2-NEXT: retq 2267; 2268; SSSE3-LABEL: combine_undef_input_test7: 2269; SSSE3: # BB#0: 2270; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2271; SSSE3-NEXT: retq 2272; 2273; SSE41-LABEL: combine_undef_input_test7: 2274; SSE41: # BB#0: 2275; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2276; SSE41-NEXT: retq 2277; 2278; AVX-LABEL: combine_undef_input_test7: 2279; AVX: # BB#0: 2280; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2281; AVX-NEXT: retq 2282 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> 
<i32 6, i32 0, i32 1, i32 7> 2283 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5> 2284 ret <4 x float> %2 2285} 2286 2287define <4 x float> @combine_undef_input_test8(<4 x float> %a) { 2288; SSE2-LABEL: combine_undef_input_test8: 2289; SSE2: # BB#0: 2290; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2291; SSE2-NEXT: retq 2292; 2293; SSSE3-LABEL: combine_undef_input_test8: 2294; SSSE3: # BB#0: 2295; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2296; SSSE3-NEXT: retq 2297; 2298; SSE41-LABEL: combine_undef_input_test8: 2299; SSE41: # BB#0: 2300; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2301; SSE41-NEXT: retq 2302; 2303; AVX-LABEL: combine_undef_input_test8: 2304; AVX: # BB#0: 2305; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2306; AVX-NEXT: retq 2307 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2308 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1> 2309 ret <4 x float> %2 2310} 2311 2312define <4 x float> @combine_undef_input_test9(<4 x float> %a) { 2313; SSE-LABEL: combine_undef_input_test9: 2314; SSE: # BB#0: 2315; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 2316; SSE-NEXT: retq 2317; 2318; AVX-LABEL: combine_undef_input_test9: 2319; AVX: # BB#0: 2320; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2321; AVX-NEXT: retq 2322 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2323 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1> 2324 ret <4 x float> %2 2325} 2326 2327define <4 x float> @combine_undef_input_test10(<4 x float> %a) { 2328; ALL-LABEL: combine_undef_input_test10: 2329; ALL: # BB#0: 2330; ALL-NEXT: retq 2331 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2332 %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7> 2333 ret <4 x float> %2 2334} 2335 2336define <4 
x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) { 2337; SSE2-LABEL: combine_undef_input_test11: 2338; SSE2: # BB#0: 2339; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2340; SSE2-NEXT: retq 2341; 2342; SSSE3-LABEL: combine_undef_input_test11: 2343; SSSE3: # BB#0: 2344; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2345; SSSE3-NEXT: retq 2346; 2347; SSE41-LABEL: combine_undef_input_test11: 2348; SSE41: # BB#0: 2349; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2350; SSE41-NEXT: retq 2351; 2352; AVX-LABEL: combine_undef_input_test11: 2353; AVX: # BB#0: 2354; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2355; AVX-NEXT: retq 2356 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2357 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6> 2358 ret <4 x float> %2 2359} 2360 2361define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) { 2362; SSE-LABEL: combine_undef_input_test12: 2363; SSE: # BB#0: 2364; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2365; SSE-NEXT: retq 2366; 2367; AVX-LABEL: combine_undef_input_test12: 2368; AVX: # BB#0: 2369; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2370; AVX-NEXT: retq 2371 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2372 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2373 ret <4 x float> %2 2374} 2375 2376define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) { 2377; SSE-LABEL: combine_undef_input_test13: 2378; SSE: # BB#0: 2379; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2380; SSE-NEXT: retq 2381; 2382; AVX-LABEL: combine_undef_input_test13: 2383; AVX: # BB#0: 2384; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2385; AVX-NEXT: retq 2386 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2387 %2 = shufflevector 
<4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5> 2388 ret <4 x float> %2 2389} 2390 2391define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) { 2392; SSE-LABEL: combine_undef_input_test14: 2393; SSE: # BB#0: 2394; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2395; SSE-NEXT: retq 2396; 2397; AVX-LABEL: combine_undef_input_test14: 2398; AVX: # BB#0: 2399; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] 2400; AVX-NEXT: retq 2401 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2402 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2403 ret <4 x float> %2 2404} 2405 2406define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) { 2407; SSE2-LABEL: combine_undef_input_test15: 2408; SSE2: # BB#0: 2409; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2410; SSE2-NEXT: movapd %xmm1, %xmm0 2411; SSE2-NEXT: retq 2412; 2413; SSSE3-LABEL: combine_undef_input_test15: 2414; SSSE3: # BB#0: 2415; SSSE3-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] 2416; SSSE3-NEXT: movapd %xmm1, %xmm0 2417; SSSE3-NEXT: retq 2418; 2419; SSE41-LABEL: combine_undef_input_test15: 2420; SSE41: # BB#0: 2421; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 2422; SSE41-NEXT: retq 2423; 2424; AVX-LABEL: combine_undef_input_test15: 2425; AVX: # BB#0: 2426; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] 2427; AVX-NEXT: retq 2428 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2429 %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2430 ret <4 x float> %2 2431} 2432 2433 2434; Verify that shuffles are canonicalized according to rules: 2435; shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) 2436; 2437; This allows to trigger the following combine rule: 2438; (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2) 2439; 2440; As a result, all 
the shuffle pairs in each function below should be 2441; combined into a single legal shuffle operation. 2442 2443define <4 x float> @combine_undef_input_test16(<4 x float> %a) { 2444; ALL-LABEL: combine_undef_input_test16: 2445; ALL: # BB#0: 2446; ALL-NEXT: retq 2447 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1> 2448 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3> 2449 ret <4 x float> %2 2450} 2451 2452define <4 x float> @combine_undef_input_test17(<4 x float> %a) { 2453; SSE2-LABEL: combine_undef_input_test17: 2454; SSE2: # BB#0: 2455; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2456; SSE2-NEXT: retq 2457; 2458; SSSE3-LABEL: combine_undef_input_test17: 2459; SSSE3: # BB#0: 2460; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2461; SSSE3-NEXT: retq 2462; 2463; SSE41-LABEL: combine_undef_input_test17: 2464; SSE41: # BB#0: 2465; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2466; SSE41-NEXT: retq 2467; 2468; AVX-LABEL: combine_undef_input_test17: 2469; AVX: # BB#0: 2470; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 2471; AVX-NEXT: retq 2472 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7> 2473 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1> 2474 ret <4 x float> %2 2475} 2476 2477define <4 x float> @combine_undef_input_test18(<4 x float> %a) { 2478; SSE2-LABEL: combine_undef_input_test18: 2479; SSE2: # BB#0: 2480; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0,0] 2481; SSE2-NEXT: retq 2482; 2483; SSSE3-LABEL: combine_undef_input_test18: 2484; SSSE3: # BB#0: 2485; SSSE3-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2486; SSSE3-NEXT: retq 2487; 2488; SSE41-LABEL: combine_undef_input_test18: 2489; SSE41: # BB#0: 2490; SSE41-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] 2491; SSE41-NEXT: retq 2492; 2493; AVX-LABEL: combine_undef_input_test18: 2494; AVX: # BB#0: 2495; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 
2496; AVX-NEXT: retq 2497 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7> 2498 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5> 2499 ret <4 x float> %2 2500} 2501 2502define <4 x float> @combine_undef_input_test19(<4 x float> %a) { 2503; SSE-LABEL: combine_undef_input_test19: 2504; SSE: # BB#0: 2505; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] 2506; SSE-NEXT: retq 2507; 2508; AVX-LABEL: combine_undef_input_test19: 2509; AVX: # BB#0: 2510; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] 2511; AVX-NEXT: retq 2512 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5> 2513 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5> 2514 ret <4 x float> %2 2515} 2516 2517define <4 x float> @combine_undef_input_test20(<4 x float> %a) { 2518; ALL-LABEL: combine_undef_input_test20: 2519; ALL: # BB#0: 2520; ALL-NEXT: retq 2521 %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3> 2522 %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3> 2523 ret <4 x float> %2 2524} 2525 2526; These tests are designed to test the ability to combine away unnecessary 2527; operations feeding into a shuffle. The AVX cases are the important ones as 2528; they leverage operations which cannot be done naturally on the entire vector 2529; and thus are decomposed into multiple smaller operations. 
; The mask reads only lanes 4-7 of %b, so the add is only needed on the
; high 128-bit half of the vector.
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector1:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT:    retq
  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i32> %c
}

; The mask reads only the high halves of %b and %c, so the add on the low
; half of %a is unneeded.
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
; SSE:       # BB#0:
; SSE-NEXT:    paddd {{.*}}(%rip), %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
; AVX1:       # BB#0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    retq
  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
  ret <8 x i32> %d
}

; The pair combines to <b[2],a[1],a[2],a[3]>: inserting %b[2] into lane 0
; of %a, a single insertps on SSE4.1+.
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps1:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps1:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps1:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x float> %d
}

; Combines to <a[0],b[2],a[2],a[3]>: inserting %b[2] into lane 1 of %a.
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps2:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps2:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps2:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %d
}

; Combines to <a[0],a[1],b[0],a[3]>: inserting %b[0] into lane 2 of %a.
define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps3:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps3:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps3:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3>
  ret <4 x float> %d
}

; Combines to <a[0],a[1],a[2],b[0]>: inserting %b[0] into lane 3 of %a.
define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps4:
; SSE2:       # BB#0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps4:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps4:
; SSE41:       # BB#0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps4:
; AVX:       # BB#0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5>
  ret <4 x float> %d
}

; The insert/bitcast/shuffle chain around the scalar load folds to a single
; zero-extending movsd followed by the store.
define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
; SSE:       # BB#0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movaps %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
; AVX:       # BB#0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
  %1 = load double, double* %a0, align 8
  %2 = insertelement <2 x double> undef, double %1, i32 0
  %3 = insertelement <2 x double> %2, double 0.000000e+00, i32 1
  %4 = bitcast <2 x double> %3 to <4 x float>
  %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  store <4 x float> %5, <4 x float>* %a1, align 16
  ret void
}

; PR30371
; Inserting %f into lane 0 of a constant vector combines into a blend of
; the scalar with a constant-pool load (movss on pre-SSE4.1).
define <4 x float> @combine_constant_insertion_v4f32(float %f) {
; SSE2-LABEL: combine_constant_insertion_v4f32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4f32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movaps {{.*#+}} xmm1 = <u,4,5,3>
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4f32:
; SSE41:       # BB#0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_constant_insertion_v4f32:
; AVX:       # BB#0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; AVX-NEXT:    retq
  %a0 = insertelement <4 x float> undef, float %f, i32 0
  %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %ret
}

; Integer variant of the case above: the insert+shuffle becomes a
; movd of the scalar plus a blend with a constant-pool vector.
define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) {
; SSE2-LABEL: combine_constant_insertion_v4i32:
; SSE2:       # BB#0:
; SSE2-NEXT:    movd %edi, %xmm1
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_constant_insertion_v4i32:
; SSSE3:       # BB#0:
; SSSE3-NEXT:    movd %edi, %xmm1
; SSSE3-NEXT:    movaps {{.*#+}} xmm0 = <u,4,5,30>
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_constant_insertion_v4i32:
; SSE41:       # BB#0:
; SSE41-NEXT:    movd %edi, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_constant_insertion_v4i32:
; AVX1:       # BB#0:
; AVX1-NEXT:    vmovd %edi, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_constant_insertion_v4i32:
; AVX2:       # BB#0:
; AVX2-NEXT:    vmovd %edi, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1,2,3]
; AVX2-NEXT:    retq
  %a0 = insertelement <4 x i32> undef, i32 %f, i32 0
  %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x i32> %ret
}

define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: PR22377:
; SSE:       # BB#0: # %entry
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,1,3]
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE-NEXT:    addps %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: PR22377:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2]
; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3>
  %s2 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
  %r2 = fadd <4 x float> %s1, %s2
  %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %s3
}

define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: PR22390:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSE2-NEXT:    addps %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22390:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSSE3-NEXT:    movaps %xmm0, %xmm2
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; SSSE3-NEXT:    addps %xmm0, %xmm2
; SSSE3-NEXT:    movaps %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22390:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; SSE41-NEXT:    blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    addps %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: PR22390:
; AVX:       # BB#0: # %entry
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
; AVX-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
entry:
  %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2>
  %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  %r2 = fadd <4 x float> %s1, %s2
  ret <4 x float> %r2
}

define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: PR22412:
; SSE2:       # BB#0: # %entry
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSE2-NEXT:    movapd %xmm2, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSE2-NEXT:    movaps %xmm3, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: PR22412:
; SSSE3:       # BB#0: # %entry
; SSSE3-NEXT:    movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
; SSSE3-NEXT:    movapd %xmm2, %xmm0
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2]
; SSSE3-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2]
; SSSE3-NEXT:    movaps %xmm3, %xmm1
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: PR22412:
; SSE41:       # BB#0: # %entry
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
; SSE41-NEXT:    movapd %xmm0, %xmm1
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2]
; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2]
; SSE41-NEXT:    movaps %xmm1, %xmm0
; SSE41-NEXT:    movaps %xmm3, %xmm1
; SSE41-NEXT:    retq
;
; AVX1-LABEL: PR22412:
; AVX1:       # BB#0: # %entry
; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR22412:
; AVX2:       # BB#0: # %entry
; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX2-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1]
; AVX2-NEXT:    retq
entry:
  %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2>
  ret <8 x float> %s2
}