; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
;
; Verify that the DAG combiner correctly folds bitwise operations across
; shuffles, nested shuffles with undef, pairs of nested shuffles, and other
; basic and always-safe patterns. Also test that the DAG combiner will combine
; target-specific shuffle instructions where reasonable.
14 15target triple = "x86_64-unknown-unknown" 16 17declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) 18declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) 19declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) 20 21define <4 x i32> @combine_pshufd1(<4 x i32> %a) { 22; CHECK-LABEL: combine_pshufd1: 23; CHECK: # %bb.0: # %entry 24; CHECK-NEXT: retq 25entry: 26 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 27 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) 28 ret <4 x i32> %c 29} 30 31define <4 x i32> @combine_pshufd2(<4 x i32> %a) { 32; CHECK-LABEL: combine_pshufd2: 33; CHECK: # %bb.0: # %entry 34; CHECK-NEXT: retq 35entry: 36 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 37 %b.cast = bitcast <4 x i32> %b to <8 x i16> 38 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28) 39 %c.cast = bitcast <8 x i16> %c to <4 x i32> 40 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 41 ret <4 x i32> %d 42} 43 44define <4 x i32> @combine_pshufd3(<4 x i32> %a) { 45; CHECK-LABEL: combine_pshufd3: 46; CHECK: # %bb.0: # %entry 47; CHECK-NEXT: retq 48entry: 49 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) 50 %b.cast = bitcast <4 x i32> %b to <8 x i16> 51 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28) 52 %c.cast = bitcast <8 x i16> %c to <4 x i32> 53 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) 54 ret <4 x i32> %d 55} 56 57define <4 x i32> @combine_pshufd4(<4 x i32> %a) { 58; SSE-LABEL: combine_pshufd4: 59; SSE: # %bb.0: # %entry 60; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 61; SSE-NEXT: retq 62; 63; AVX-LABEL: combine_pshufd4: 64; AVX: # %bb.0: # %entry 65; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 66; AVX-NEXT: retq 67entry: 68 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) 69 %b.cast = bitcast <4 x i32> %b to <8 x i16> 70 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 
x i16> %b.cast, i8 27) 71 %c.cast = bitcast <8 x i16> %c to <4 x i32> 72 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) 73 ret <4 x i32> %d 74} 75 76define <4 x i32> @combine_pshufd5(<4 x i32> %a) { 77; SSE-LABEL: combine_pshufd5: 78; SSE: # %bb.0: # %entry 79; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 80; SSE-NEXT: retq 81; 82; AVX-LABEL: combine_pshufd5: 83; AVX: # %bb.0: # %entry 84; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 85; AVX-NEXT: retq 86entry: 87 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) 88 %b.cast = bitcast <4 x i32> %b to <8 x i16> 89 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27) 90 %c.cast = bitcast <8 x i16> %c to <4 x i32> 91 %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76) 92 ret <4 x i32> %d 93} 94 95define <4 x i32> @combine_pshufd6(<4 x i32> %a) { 96; SSE-LABEL: combine_pshufd6: 97; SSE: # %bb.0: # %entry 98; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 99; SSE-NEXT: retq 100; 101; AVX1-LABEL: combine_pshufd6: 102; AVX1: # %bb.0: # %entry 103; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] 104; AVX1-NEXT: retq 105; 106; AVX2-LABEL: combine_pshufd6: 107; AVX2: # %bb.0: # %entry 108; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 109; AVX2-NEXT: retq 110entry: 111 %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 0) 112 %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 8) 113 ret <4 x i32> %c 114} 115 116define <8 x i16> @combine_pshuflw1(<8 x i16> %a) { 117; CHECK-LABEL: combine_pshuflw1: 118; CHECK: # %bb.0: # %entry 119; CHECK-NEXT: retq 120entry: 121 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 122 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 123 ret <8 x i16> %c 124} 125 126define <8 x i16> @combine_pshuflw2(<8 x i16> %a) { 127; CHECK-LABEL: combine_pshuflw2: 128; CHECK: # %bb.0: # %entry 129; CHECK-NEXT: retq 130entry: 131 %b = call <8 x i16> 
@llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 132 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28) 133 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 134 ret <8 x i16> %d 135} 136 137define <8 x i16> @combine_pshuflw3(<8 x i16> %a) { 138; SSE-LABEL: combine_pshuflw3: 139; SSE: # %bb.0: # %entry 140; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 141; SSE-NEXT: retq 142; 143; AVX-LABEL: combine_pshuflw3: 144; AVX: # %bb.0: # %entry 145; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] 146; AVX-NEXT: retq 147entry: 148 %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) 149 %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27) 150 %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27) 151 ret <8 x i16> %d 152} 153 154define <8 x i16> @combine_pshufhw1(<8 x i16> %a) { 155; SSE-LABEL: combine_pshufhw1: 156; SSE: # %bb.0: # %entry 157; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 158; SSE-NEXT: retq 159; 160; AVX-LABEL: combine_pshufhw1: 161; AVX: # %bb.0: # %entry 162; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] 163; AVX-NEXT: retq 164entry: 165 %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) 166 %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27) 167 %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27) 168 ret <8 x i16> %d 169} 170 171define <4 x i32> @combine_bitwise_ops_test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 172; SSE-LABEL: combine_bitwise_ops_test1: 173; SSE: # %bb.0: 174; SSE-NEXT: pand %xmm1, %xmm0 175; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 176; SSE-NEXT: retq 177; 178; AVX-LABEL: combine_bitwise_ops_test1: 179; AVX: # %bb.0: 180; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 181; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 182; AVX-NEXT: retq 183 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 184 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x 
i32><i32 0, i32 2, i32 1, i32 3> 185 %and = and <4 x i32> %shuf1, %shuf2 186 ret <4 x i32> %and 187} 188 189define <4 x i32> @combine_bitwise_ops_test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 190; SSE-LABEL: combine_bitwise_ops_test2: 191; SSE: # %bb.0: 192; SSE-NEXT: por %xmm1, %xmm0 193; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 194; SSE-NEXT: retq 195; 196; AVX-LABEL: combine_bitwise_ops_test2: 197; AVX: # %bb.0: 198; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 199; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 200; AVX-NEXT: retq 201 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 202 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 203 %or = or <4 x i32> %shuf1, %shuf2 204 ret <4 x i32> %or 205} 206 207define <4 x i32> @combine_bitwise_ops_test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 208; SSE-LABEL: combine_bitwise_ops_test3: 209; SSE: # %bb.0: 210; SSE-NEXT: pxor %xmm1, %xmm0 211; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 212; SSE-NEXT: retq 213; 214; AVX-LABEL: combine_bitwise_ops_test3: 215; AVX: # %bb.0: 216; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 217; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 218; AVX-NEXT: retq 219 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 220 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3> 221 %xor = xor <4 x i32> %shuf1, %shuf2 222 ret <4 x i32> %xor 223} 224 225define <4 x i32> @combine_bitwise_ops_test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 226; SSE-LABEL: combine_bitwise_ops_test4: 227; SSE: # %bb.0: 228; SSE-NEXT: pand %xmm1, %xmm0 229; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 230; SSE-NEXT: retq 231; 232; AVX-LABEL: combine_bitwise_ops_test4: 233; AVX: # %bb.0: 234; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 235; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 236; AVX-NEXT: retq 237 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 
x i32><i32 4, i32 6, i32 5, i32 7> 238 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 239 %and = and <4 x i32> %shuf1, %shuf2 240 ret <4 x i32> %and 241} 242 243define <4 x i32> @combine_bitwise_ops_test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 244; SSE-LABEL: combine_bitwise_ops_test5: 245; SSE: # %bb.0: 246; SSE-NEXT: por %xmm1, %xmm0 247; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 248; SSE-NEXT: retq 249; 250; AVX-LABEL: combine_bitwise_ops_test5: 251; AVX: # %bb.0: 252; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 253; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 254; AVX-NEXT: retq 255 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 256 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 257 %or = or <4 x i32> %shuf1, %shuf2 258 ret <4 x i32> %or 259} 260 261define <4 x i32> @combine_bitwise_ops_test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 262; SSE-LABEL: combine_bitwise_ops_test6: 263; SSE: # %bb.0: 264; SSE-NEXT: pxor %xmm1, %xmm0 265; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 266; SSE-NEXT: retq 267; 268; AVX-LABEL: combine_bitwise_ops_test6: 269; AVX: # %bb.0: 270; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 271; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 272; AVX-NEXT: retq 273 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7> 274 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7> 275 %xor = xor <4 x i32> %shuf1, %shuf2 276 ret <4 x i32> %xor 277} 278 279 280; Verify that DAGCombiner moves the shuffle after the xor/and/or even if shuffles 281; are not performing a swizzle operations. 
282 283define <4 x i32> @combine_bitwise_ops_test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 284; SSE2-LABEL: combine_bitwise_ops_test1b: 285; SSE2: # %bb.0: 286; SSE2-NEXT: pand %xmm1, %xmm0 287; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 288; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 289; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 290; SSE2-NEXT: retq 291; 292; SSSE3-LABEL: combine_bitwise_ops_test1b: 293; SSSE3: # %bb.0: 294; SSSE3-NEXT: pand %xmm1, %xmm0 295; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 296; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 297; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 298; SSSE3-NEXT: retq 299; 300; SSE41-LABEL: combine_bitwise_ops_test1b: 301; SSE41: # %bb.0: 302; SSE41-NEXT: andps %xmm1, %xmm0 303; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 304; SSE41-NEXT: retq 305; 306; AVX-LABEL: combine_bitwise_ops_test1b: 307; AVX: # %bb.0: 308; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 309; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 310; AVX-NEXT: retq 311 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 312 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 313 %and = and <4 x i32> %shuf1, %shuf2 314 ret <4 x i32> %and 315} 316 317define <4 x i32> @combine_bitwise_ops_test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 318; SSE2-LABEL: combine_bitwise_ops_test2b: 319; SSE2: # %bb.0: 320; SSE2-NEXT: por %xmm1, %xmm0 321; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 322; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 323; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 324; SSE2-NEXT: retq 325; 326; SSSE3-LABEL: combine_bitwise_ops_test2b: 327; SSSE3: # %bb.0: 328; SSSE3-NEXT: por %xmm1, %xmm0 329; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 330; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3] 331; SSSE3-NEXT: 
punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 332; SSSE3-NEXT: retq 333; 334; SSE41-LABEL: combine_bitwise_ops_test2b: 335; SSE41: # %bb.0: 336; SSE41-NEXT: orps %xmm1, %xmm0 337; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 338; SSE41-NEXT: retq 339; 340; AVX-LABEL: combine_bitwise_ops_test2b: 341; AVX: # %bb.0: 342; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 343; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] 344; AVX-NEXT: retq 345 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 346 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 347 %or = or <4 x i32> %shuf1, %shuf2 348 ret <4 x i32> %or 349} 350 351define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 352; SSE2-LABEL: combine_bitwise_ops_test3b: 353; SSE2: # %bb.0: 354; SSE2-NEXT: xorps %xmm1, %xmm0 355; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 356; SSE2-NEXT: retq 357; 358; SSSE3-LABEL: combine_bitwise_ops_test3b: 359; SSSE3: # %bb.0: 360; SSSE3-NEXT: xorps %xmm1, %xmm0 361; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 362; SSSE3-NEXT: retq 363; 364; SSE41-LABEL: combine_bitwise_ops_test3b: 365; SSE41: # %bb.0: 366; SSE41-NEXT: xorps %xmm1, %xmm0 367; SSE41-NEXT: xorps %xmm1, %xmm1 368; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 369; SSE41-NEXT: retq 370; 371; AVX-LABEL: combine_bitwise_ops_test3b: 372; AVX: # %bb.0: 373; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 374; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 375; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 376; AVX-NEXT: retq 377 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 378 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7> 379 %xor = xor <4 x i32> %shuf1, %shuf2 380 ret <4 x i32> %xor 381} 382 383define <4 x i32> @combine_bitwise_ops_test4b(<4 x 
i32> %a, <4 x i32> %b, <4 x i32> %c) { 384; SSE2-LABEL: combine_bitwise_ops_test4b: 385; SSE2: # %bb.0: 386; SSE2-NEXT: pand %xmm1, %xmm0 387; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 388; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 389; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 390; SSE2-NEXT: retq 391; 392; SSSE3-LABEL: combine_bitwise_ops_test4b: 393; SSSE3: # %bb.0: 394; SSSE3-NEXT: pand %xmm1, %xmm0 395; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 396; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 397; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 398; SSSE3-NEXT: retq 399; 400; SSE41-LABEL: combine_bitwise_ops_test4b: 401; SSE41: # %bb.0: 402; SSE41-NEXT: andps %xmm1, %xmm0 403; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 404; SSE41-NEXT: retq 405; 406; AVX-LABEL: combine_bitwise_ops_test4b: 407; AVX: # %bb.0: 408; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 409; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 410; AVX-NEXT: retq 411 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 412 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 413 %and = and <4 x i32> %shuf1, %shuf2 414 ret <4 x i32> %and 415} 416 417define <4 x i32> @combine_bitwise_ops_test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 418; SSE2-LABEL: combine_bitwise_ops_test5b: 419; SSE2: # %bb.0: 420; SSE2-NEXT: por %xmm1, %xmm0 421; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 422; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 423; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 424; SSE2-NEXT: retq 425; 426; SSSE3-LABEL: combine_bitwise_ops_test5b: 427; SSSE3: # %bb.0: 428; SSSE3-NEXT: por %xmm1, %xmm0 429; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3] 430; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 431; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 432; 
SSSE3-NEXT: retq 433; 434; SSE41-LABEL: combine_bitwise_ops_test5b: 435; SSE41: # %bb.0: 436; SSE41-NEXT: orps %xmm1, %xmm0 437; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 438; SSE41-NEXT: retq 439; 440; AVX-LABEL: combine_bitwise_ops_test5b: 441; AVX: # %bb.0: 442; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 443; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] 444; AVX-NEXT: retq 445 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 446 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 447 %or = or <4 x i32> %shuf1, %shuf2 448 ret <4 x i32> %or 449} 450 451define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 452; SSE2-LABEL: combine_bitwise_ops_test6b: 453; SSE2: # %bb.0: 454; SSE2-NEXT: xorps %xmm1, %xmm0 455; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 456; SSE2-NEXT: retq 457; 458; SSSE3-LABEL: combine_bitwise_ops_test6b: 459; SSSE3: # %bb.0: 460; SSSE3-NEXT: xorps %xmm1, %xmm0 461; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 462; SSSE3-NEXT: retq 463; 464; SSE41-LABEL: combine_bitwise_ops_test6b: 465; SSE41: # %bb.0: 466; SSE41-NEXT: xorps %xmm1, %xmm0 467; SSE41-NEXT: xorps %xmm1, %xmm1 468; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 469; SSE41-NEXT: retq 470; 471; AVX-LABEL: combine_bitwise_ops_test6b: 472; AVX: # %bb.0: 473; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 474; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 475; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 476; AVX-NEXT: retq 477 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7> 478 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7> 479 %xor = xor <4 x i32> %shuf1, %shuf2 480 ret <4 x i32> %xor 481} 482 483define <4 x i32> @combine_bitwise_ops_test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 484; SSE-LABEL: 
combine_bitwise_ops_test1c: 485; SSE: # %bb.0: 486; SSE-NEXT: andps %xmm1, %xmm0 487; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 488; SSE-NEXT: retq 489; 490; AVX-LABEL: combine_bitwise_ops_test1c: 491; AVX: # %bb.0: 492; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 493; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 494; AVX-NEXT: retq 495 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 496 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 497 %and = and <4 x i32> %shuf1, %shuf2 498 ret <4 x i32> %and 499} 500 501define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 502; SSE-LABEL: combine_bitwise_ops_test2c: 503; SSE: # %bb.0: 504; SSE-NEXT: orps %xmm1, %xmm0 505; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 506; SSE-NEXT: retq 507; 508; AVX-LABEL: combine_bitwise_ops_test2c: 509; AVX: # %bb.0: 510; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 511; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[1,3] 512; AVX-NEXT: retq 513 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 514 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 515 %or = or <4 x i32> %shuf1, %shuf2 516 ret <4 x i32> %or 517} 518 519define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 520; SSE2-LABEL: combine_bitwise_ops_test3c: 521; SSE2: # %bb.0: 522; SSE2-NEXT: xorps %xmm1, %xmm0 523; SSE2-NEXT: xorps %xmm1, %xmm1 524; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 525; SSE2-NEXT: retq 526; 527; SSSE3-LABEL: combine_bitwise_ops_test3c: 528; SSSE3: # %bb.0: 529; SSSE3-NEXT: xorps %xmm1, %xmm0 530; SSSE3-NEXT: xorps %xmm1, %xmm1 531; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] 532; SSSE3-NEXT: retq 533; 534; SSE41-LABEL: combine_bitwise_ops_test3c: 535; SSE41: # %bb.0: 536; SSE41-NEXT: xorps %xmm1, %xmm0 537; SSE41-NEXT: insertps {{.*#+}} xmm0 = 
xmm0[0,2],zero,zero 538; SSE41-NEXT: retq 539; 540; AVX-LABEL: combine_bitwise_ops_test3c: 541; AVX: # %bb.0: 542; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 543; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero 544; AVX-NEXT: retq 545 %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 546 %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7> 547 %xor = xor <4 x i32> %shuf1, %shuf2 548 ret <4 x i32> %xor 549} 550 551define <4 x i32> @combine_bitwise_ops_test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 552; SSE-LABEL: combine_bitwise_ops_test4c: 553; SSE: # %bb.0: 554; SSE-NEXT: andps %xmm1, %xmm0 555; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] 556; SSE-NEXT: movaps %xmm2, %xmm0 557; SSE-NEXT: retq 558; 559; AVX-LABEL: combine_bitwise_ops_test4c: 560; AVX: # %bb.0: 561; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 562; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] 563; AVX-NEXT: retq 564 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 565 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 566 %and = and <4 x i32> %shuf1, %shuf2 567 ret <4 x i32> %and 568} 569 570define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 571; SSE-LABEL: combine_bitwise_ops_test5c: 572; SSE: # %bb.0: 573; SSE-NEXT: orps %xmm1, %xmm0 574; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm0[1,3] 575; SSE-NEXT: movaps %xmm2, %xmm0 576; SSE-NEXT: retq 577; 578; AVX-LABEL: combine_bitwise_ops_test5c: 579; AVX: # %bb.0: 580; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 581; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] 582; AVX-NEXT: retq 583 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 584 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 585 %or = or <4 x i32> %shuf1, %shuf2 586 ret <4 x i32> %or 587} 588 589define <4 x 
i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { 590; SSE2-LABEL: combine_bitwise_ops_test6c: 591; SSE2: # %bb.0: 592; SSE2-NEXT: xorps %xmm1, %xmm0 593; SSE2-NEXT: xorps %xmm1, %xmm1 594; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] 595; SSE2-NEXT: movaps %xmm1, %xmm0 596; SSE2-NEXT: retq 597; 598; SSSE3-LABEL: combine_bitwise_ops_test6c: 599; SSSE3: # %bb.0: 600; SSSE3-NEXT: xorps %xmm1, %xmm0 601; SSSE3-NEXT: xorps %xmm1, %xmm1 602; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3] 603; SSSE3-NEXT: movaps %xmm1, %xmm0 604; SSSE3-NEXT: retq 605; 606; SSE41-LABEL: combine_bitwise_ops_test6c: 607; SSE41: # %bb.0: 608; SSE41-NEXT: xorps %xmm1, %xmm0 609; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3] 610; SSE41-NEXT: retq 611; 612; AVX-LABEL: combine_bitwise_ops_test6c: 613; AVX: # %bb.0: 614; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 615; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3] 616; AVX-NEXT: retq 617 %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7> 618 %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7> 619 %xor = xor <4 x i32> %shuf1, %shuf2 620 ret <4 x i32> %xor 621} 622 623define <4 x i32> @combine_nested_undef_test1(<4 x i32> %A, <4 x i32> %B) { 624; SSE-LABEL: combine_nested_undef_test1: 625; SSE: # %bb.0: 626; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1] 627; SSE-NEXT: retq 628; 629; AVX-LABEL: combine_nested_undef_test1: 630; AVX: # %bb.0: 631; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] 632; AVX-NEXT: retq 633 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1> 634 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 635 ret <4 x i32> %2 636} 637 638define <4 x i32> @combine_nested_undef_test2(<4 x i32> %A, <4 x i32> %B) { 639; SSE-LABEL: combine_nested_undef_test2: 640; SSE: # %bb.0: 641; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 642; 
SSE-NEXT: retq 643; 644; AVX-LABEL: combine_nested_undef_test2: 645; AVX: # %bb.0: 646; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] 647; AVX-NEXT: retq 648 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 3> 649 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 650 ret <4 x i32> %2 651} 652 653define <4 x i32> @combine_nested_undef_test3(<4 x i32> %A, <4 x i32> %B) { 654; SSE-LABEL: combine_nested_undef_test3: 655; SSE: # %bb.0: 656; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] 657; SSE-NEXT: retq 658; 659; AVX-LABEL: combine_nested_undef_test3: 660; AVX: # %bb.0: 661; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] 662; AVX-NEXT: retq 663 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 3> 664 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 3> 665 ret <4 x i32> %2 666} 667 668define <4 x i32> @combine_nested_undef_test4(<4 x i32> %A, <4 x i32> %B) { 669; SSE-LABEL: combine_nested_undef_test4: 670; SSE: # %bb.0: 671; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 672; SSE-NEXT: retq 673; 674; AVX1-LABEL: combine_nested_undef_test4: 675; AVX1: # %bb.0: 676; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 677; AVX1-NEXT: retq 678; 679; AVX2-LABEL: combine_nested_undef_test4: 680; AVX2: # %bb.0: 681; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] 682; AVX2-NEXT: retq 683 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 7, i32 1> 684 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 4, i32 0, i32 3> 685 ret <4 x i32> %2 686} 687 688define <4 x i32> @combine_nested_undef_test5(<4 x i32> %A, <4 x i32> %B) { 689; SSE-LABEL: combine_nested_undef_test5: 690; SSE: # %bb.0: 691; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 692; SSE-NEXT: retq 693; 694; AVX-LABEL: combine_nested_undef_test5: 695; AVX: # %bb.0: 696; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] 697; 
AVX-NEXT: retq 698 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 5, i32 5, i32 2, i32 3> 699 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 4, i32 3> 700 ret <4 x i32> %2 701} 702 703define <4 x i32> @combine_nested_undef_test6(<4 x i32> %A, <4 x i32> %B) { 704; SSE-LABEL: combine_nested_undef_test6: 705; SSE: # %bb.0: 706; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] 707; SSE-NEXT: retq 708; 709; AVX-LABEL: combine_nested_undef_test6: 710; AVX: # %bb.0: 711; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] 712; AVX-NEXT: retq 713 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 714 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 4, i32 0, i32 4> 715 ret <4 x i32> %2 716} 717 718define <4 x i32> @combine_nested_undef_test7(<4 x i32> %A, <4 x i32> %B) { 719; SSE-LABEL: combine_nested_undef_test7: 720; SSE: # %bb.0: 721; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2] 722; SSE-NEXT: retq 723; 724; AVX-LABEL: combine_nested_undef_test7: 725; AVX: # %bb.0: 726; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] 727; AVX-NEXT: retq 728 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7> 729 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 730 ret <4 x i32> %2 731} 732 733define <4 x i32> @combine_nested_undef_test8(<4 x i32> %A, <4 x i32> %B) { 734; SSE-LABEL: combine_nested_undef_test8: 735; SSE: # %bb.0: 736; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 737; SSE-NEXT: retq 738; 739; AVX-LABEL: combine_nested_undef_test8: 740; AVX: # %bb.0: 741; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] 742; AVX-NEXT: retq 743 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3> 744 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 3, i32 4> 745 ret <4 x i32> %2 746} 747 748define <4 x i32> @combine_nested_undef_test9(<4 x i32> %A, <4 
x i32> %B) { 749; SSE-LABEL: combine_nested_undef_test9: 750; SSE: # %bb.0: 751; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,2] 752; SSE-NEXT: retq 753; 754; AVX-LABEL: combine_nested_undef_test9: 755; AVX: # %bb.0: 756; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2] 757; AVX-NEXT: retq 758 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 3, i32 2, i32 5> 759 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 2> 760 ret <4 x i32> %2 761} 762 763define <4 x i32> @combine_nested_undef_test10(<4 x i32> %A, <4 x i32> %B) { 764; SSE-LABEL: combine_nested_undef_test10: 765; SSE: # %bb.0: 766; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] 767; SSE-NEXT: retq 768; 769; AVX-LABEL: combine_nested_undef_test10: 770; AVX: # %bb.0: 771; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] 772; AVX-NEXT: retq 773 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 1, i32 5, i32 5> 774 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 4> 775 ret <4 x i32> %2 776} 777 778define <4 x i32> @combine_nested_undef_test11(<4 x i32> %A, <4 x i32> %B) { 779; SSE-LABEL: combine_nested_undef_test11: 780; SSE: # %bb.0: 781; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,1] 782; SSE-NEXT: retq 783; 784; AVX-LABEL: combine_nested_undef_test11: 785; AVX: # %bb.0: 786; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1] 787; AVX-NEXT: retq 788 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 2, i32 5, i32 4> 789 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 0> 790 ret <4 x i32> %2 791} 792 793define <4 x i32> @combine_nested_undef_test12(<4 x i32> %A, <4 x i32> %B) { 794; SSE-LABEL: combine_nested_undef_test12: 795; SSE: # %bb.0: 796; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 797; SSE-NEXT: retq 798; 799; AVX1-LABEL: combine_nested_undef_test12: 800; AVX1: # %bb.0: 801; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] 802; 
AVX1-NEXT: retq 803; 804; AVX2-LABEL: combine_nested_undef_test12: 805; AVX2: # %bb.0: 806; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 807; AVX2-NEXT: retq 808 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 0, i32 2, i32 4> 809 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 4, i32 0, i32 4> 810 ret <4 x i32> %2 811} 812 813; The following pair of shuffles is folded into vector %A. 814define <4 x i32> @combine_nested_undef_test13(<4 x i32> %A, <4 x i32> %B) { 815; CHECK-LABEL: combine_nested_undef_test13: 816; CHECK: # %bb.0: 817; CHECK-NEXT: retq 818 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 4, i32 2, i32 6> 819 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 4, i32 0, i32 2, i32 4> 820 ret <4 x i32> %2 821} 822 823; The following pair of shuffles is folded into vector %B. 824define <4 x i32> @combine_nested_undef_test14(<4 x i32> %A, <4 x i32> %B) { 825; SSE-LABEL: combine_nested_undef_test14: 826; SSE: # %bb.0: 827; SSE-NEXT: movaps %xmm1, %xmm0 828; SSE-NEXT: retq 829; 830; AVX-LABEL: combine_nested_undef_test14: 831; AVX: # %bb.0: 832; AVX-NEXT: vmovaps %xmm1, %xmm0 833; AVX-NEXT: retq 834 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 6, i32 2, i32 4> 835 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 4, i32 1, i32 4> 836 ret <4 x i32> %2 837} 838 839 840; Verify that we don't optimize the following cases. We expect more than one shuffle. 841; 842; FIXME: Many of these already don't make sense, and the rest should stop 843; making sense with th enew vector shuffle lowering. Revisit at least testing for 844; it. 
; Tests 15-21: shuffle(shuffle(A, B), undef) pairs where lanes from both
; inputs survive, so the combiner cannot fold to a single shuffle and two
; instructions remain in the checked output.
define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test15:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test15:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
; AVX2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test16(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test17(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,0,1]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

; All lanes used by the second mask originate from %B (indices 4, 5 and 7 of
; the first mask), so the pair folds to one shuffle of xmm1.
define <4 x i32> @combine_nested_undef_test18(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test18:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test19(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test19:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test19:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,0,0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test19:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,0,0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test20(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test20:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test20:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test20:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,3,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test20:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 3, i32 2, i32 4, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test21(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: combine_nested_undef_test21:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_nested_undef_test21:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,0,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_nested_undef_test21:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test21:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test21:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 3, i32 1>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}


; Test that we correctly combine shuffles according to rule
;  shuffle(shuffle(x, y), undef) -> shuffle(y, undef)

define <4 x i32> @combine_nested_undef_test22(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test22:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test22:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test23(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test23:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test23:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test24(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test24:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test24:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 4>
  ret <4 x i32> %2
}

; Tests 25-28 swap the shuffle operands (%B first); the surviving lanes all
; come from %A, so a single shuffle of xmm0 is expected.
define <4 x i32> @combine_nested_undef_test25(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test25:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test25:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test25:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 5, i32 2, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 1, i32 3, i32 1>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test26(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test26:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test26:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test27(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test27:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_nested_undef_test27:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_nested_undef_test27:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX2-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 2, i32 1, i32 5, i32 4>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_nested_undef_test28(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: combine_nested_undef_test28:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_nested_undef_test28:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 3, i32 2>
  ret <4 x i32> %2
}

; The second shuffle of this pair re-selects every lane from %b, so the
; combined result is simply a copy of %b (movaps xmm1 -> xmm0).
define <4 x float> @combine_test1(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}

define <4 x float> @combine_test2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %2
}

define <4 x float> @combine_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test5:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %2
}

; Integer variant of combine_test1; also folds to a plain register copy.
define <4 x i32> @combine_test6(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test6:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test7(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test7:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}
define <4 x i32> @combine_test8(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test8:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test9(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test9:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test10(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test10:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test10:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test10:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test10:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i32> %2
}

; The second shuffle applies the same mask against %a, undoing the first
; selection entirely: no instructions are expected, just retq.
define <4 x float> @combine_test11(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: combine_test11:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x float> %2
}

define <4 x float> @combine_test12(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test12:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test12:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test12:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test12:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %2
}

define <4 x float> @combine_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test13:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test13:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}

define <4 x float> @combine_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test14:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test15:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %2
}

; Integer variant of combine_test11; also folds to nothing.
define <4 x i32> @combine_test16(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: combine_test16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test17(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test18(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test18:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test19(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test19:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 6, i32 7, i32 5, i32 5>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test20(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: combine_test20:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test20:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test20:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test20:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %a, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x i32> %2
}

; Splits the <8 x i32> argument into its low and high 128-bit halves: the
; low half (elements 0,1,4,5) is stored through %ptr, the high half
; (elements 2,3,6,7) is returned.
define <4 x i32> @combine_test21(<8 x i32> %a, ptr %ptr) {
; SSE-LABEL: combine_test21:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; SSE-NEXT:    movaps %xmm2, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test21:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX-NEXT:    vmovaps %xmm2, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  store <4 x i32> %1, ptr %ptr, align 16
  ret <4 x i32> %2
}

; Concatenates two <2 x float> loads into the low four lanes of the result;
; the high four lanes are undef.
define <8 x float> @combine_test22(ptr %a, ptr %b) {
; SSE-LABEL: combine_test22:
; SSE:       # %bb.0:
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test22:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT:    retq
; Current AVX2 lowering of this is still awful, not adding a test case.
  %1 = load <2 x float>, ptr %a, align 8
  %2 = load <2 x float>, ptr %b, align 8
  %3 = shufflevector <2 x float> %1, <2 x float> %2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %3
}

; PR22359
; The two adjacent <2 x float> stores should merge into one unaligned
; 16-byte store of the low half of %v.
define void @combine_test23(<8 x float> %v, ptr %ptr) {
; SSE-LABEL: combine_test23:
; SSE:       # %bb.0:
; SSE-NEXT:    movups %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test23:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %idx2 = getelementptr inbounds <2 x float>, ptr %ptr, i64 1
  %shuffle0 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 0, i32 1>
  %shuffle1 = shufflevector <8 x float> %v, <8 x float> undef, <2 x i32> <i32 2, i32 3>
  store <2 x float> %shuffle0, ptr %ptr, align 8
  store <2 x float> %shuffle1, ptr %idx2, align 8
  ret void
}

; Check some negative cases.
; FIXME: Do any of these really make sense? Are they redundant with the above tests?
define <4 x float> @combine_test1b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test1b:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test1b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 0>
  ret <4 x float> %2
}

; Every surviving lane comes from %b (lane 0 is duplicated into lane 2/3),
; so this should lower to a single duplicate of xmm1's low 64 bits.
define <4 x float> @combine_test2b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test2b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test2b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test2b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test2b:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm1[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_test3b(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_test3b:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test3b:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test3b:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test3b:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 0, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 7>
  ret <4 x float> %2
}

define <4 x float> @combine_test4b(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_test4b:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test4b:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 5, i32 5, i32 2, i32 7>
  ret <4 x float> %2
}


; Verify that we correctly fold shuffles even when we use illegal vector types.

; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of this file);
; regenerate them with that script rather than editing by hand.

; <4 x i8> is illegal and gets widened; the shuffle pair still folds to a
; single byte blend of the two loaded values (<A[0], B[1], B[2], A[3]>).
define <4 x i8> @combine_test1c(ptr %a, ptr %b) {
; SSE2-LABEL: combine_test1c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    andps %xmm0, %xmm2
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test1c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test1c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test1c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <4 x i8>, ptr %a
  %B = load <4 x i8>, ptr %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x i8> %2
}

; Folds to a single unpack of the two widened loads.
define <4 x i8> @combine_test2c(ptr %a, ptr %b) {
; SSE-LABEL: combine_test2c:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test2c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT:    retq
  %A = load <4 x i8>, ptr %a
  %B = load <4 x i8>, ptr %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 0, i32 5, i32 1, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x i8> %2
}

; Folds to an unpack followed by a single pshufd.
define <4 x i8> @combine_test3c(ptr %a, ptr %b) {
; SSE-LABEL: combine_test3c:
; SSE:       # %bb.0:
; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test3c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX-NEXT:    retq
  %A = load <4 x i8>, ptr %a
  %B = load <4 x i8>, ptr %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x i8> %2
}

; Same pattern as combine_test1c but keeping lane 1 from the other source:
; result is <B[0], A[1], B[2], B[3]>.
define <4 x i8> @combine_test4c(ptr %a, ptr %b) {
; SSE2-LABEL: combine_test4c:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; SSE2-NEXT:    andps %xmm0, %xmm2
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_test4c:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,3,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_test4c:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE41-NEXT:    movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_test4c:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %A = load <4 x i8>, ptr %a
  %B = load <4 x i8>, ptr %b
  %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %2 = shufflevector <4 x i8> %1, <4 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x i8> %2
}


; The following test cases are generated from this C++ code
;
;__m128 blend_01(__m128 a, __m128 b)
;{
;  __m128 s = a;
;  s = _mm_blend_ps( s, b, 1<<0 );
;  s =
;      _mm_blend_ps( s, b, 1<<1 );
;  return s;
;}
;
;__m128 blend_02(__m128 a, __m128 b)
;{
;  __m128 s = a;
;  s = _mm_blend_ps( s, b, 1<<0 );
;  s = _mm_blend_ps( s, b, 1<<2 );
;  return s;
;}
;
;__m128 blend_123(__m128 a, __m128 b)
;{
;  __m128 s = a;
;  s = _mm_blend_ps( s, b, 1<<1 );
;  s = _mm_blend_ps( s, b, 1<<2 );
;  s = _mm_blend_ps( s, b, 1<<3 );
;  return s;
;}

; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of this file);
; regenerate them with that script rather than editing by hand.

; Ideally, we should collapse the following shuffles into a single one.

; Chained single-lane blends of %b into %a; collapses to one movsd/blendps.
define <4 x float> @combine_blend_01(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_01:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_01:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_01:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_01:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 undef, i32 2, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle6
}

; Blend of even lanes of %b with odd lanes of %a; a single blendps on SSE4.1+.
define <4 x float> @combine_blend_02(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_02:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_02:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_02:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_02:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 undef, i32 3>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ret <4 x float> %shuffle6
}

; Three chained blends collapse to keeping a[0] and taking b[1..3].
define <4 x float> @combine_blend_123(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_blend_123:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_blend_123:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_blend_123:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_blend_123:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT:    retq
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
  %shuffle6 = shufflevector <4 x float> %shuffle, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 undef>
  %shuffle12 = shufflevector <4 x float> %shuffle6, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
  ret <4 x float> %shuffle12
}

; The movhl tests below all fold to a single unpckhpd of the high halves.
define <4 x i32> @combine_test_movhl_1(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_1:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 7, i32 5, i32 3>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 1, i32 0, i32 3>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test_movhl_2(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_2:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 0, i32 3, i32 6>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 3, i32 7, i32 0, i32 2>
  ret <4 x i32> %2
}

define <4 x i32> @combine_test_movhl_3(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: combine_test_movhl_3:
; SSE:       # %bb.0:
; SSE-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_test_movhl_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 7, i32 6, i32 3, i32 2>
  %2 = shufflevector <4 x i32> %1, <4 x i32> %b, <4 x i32> <i32 6, i32 0, i32 3, i32 2>
  ret <4 x i32> %2
}

; or/and of two zero-interleaving shuffles; with SSSE3+ the whole expression
; folds to two pshufb, a por and a pand.
define <16 x i8> @combine_and_or_shuffle(<16 x i8> %x, <16 x i8> %y) {
; SSE2-LABEL: combine_and_or_shuffle:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,3,1,2,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,7,7]
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,1,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,0,2,1,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pandn %xmm4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_and_or_shuffle:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
; SSSE3-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
; SSSE3-NEXT:    por %xmm1, %xmm0
; SSSE3-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_and_or_shuffle:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
; SSE41-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_and_or_shuffle:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,xmm0[u],zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[7,u,0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <16 x i8> %x, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 15, i32 16, i32 1, i32 16, i32 14, i32 16, i32 2, i32 16, i32 13, i32 16, i32 3, i32 16, i32 16>
  %2 = shufflevector <16 x i8> %y, <16 x i8> zeroinitializer, <16 x i32> <i32 7, i32 16, i32 0, i32 16, i32 8, i32 16, i32 1, i32 16, i32 9, i32 16, i32 10, i32 16, i32 7, i32 16, i32 7, i32 16>
  %3 = or <16 x i8> %1, %2
  %4 = and <16 x i8> %3, <i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  ret <16 x i8> %4
}

; Verify that we fold shuffles according to rule:
;  (shuffle(shuffle A, Undef, M0), B, M1) -> (shuffle A, B, M2)

define <4 x float> @combine_undef_input_test1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test2(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test2:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test3(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test3:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test4(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test4:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test5(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test5:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test5:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}


; Verify that we fold shuffles according to rule:
;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)

define <4 x float> @combine_undef_input_test6(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 4, i32 5, i32 1, i32 2>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test7(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test7:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test7:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 1, i32 2, i32 4, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test8(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test9(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test9:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test9:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 6, i32 7, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test10(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test10:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 6, i32 7>
  ret <4 x float> %2
}

; Tests 11-15 repeat tests 1-5 with the inner shuffle as the *second* operand
; of the outer shuffle; the same folds must apply.
define <4 x float> @combine_undef_input_test11(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test11:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test11:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test11:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test11:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 6>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test12(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test12:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test12:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test13(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test13:
; SSE:       # %bb.0:
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test13:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 5, i32 0, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test14(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: combine_undef_input_test14:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test14:
; AVX:       # %bb.0:
; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test15(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_undef_input_test15:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test15:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test15:
; SSE41:       # %bb.0:
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test15:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %b, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}


; Verify that shuffles are canonicalized according to rules:
;  shuffle(B, shuffle(A, Undef))
;  -> shuffle(shuffle(A, Undef), B)
;
; This allows to trigger the following combine rule:
;  (shuffle(shuffle A, Undef, M0), A, M1) -> (shuffle A, Undef, M2)
;
; As a result, all the shuffle pairs in each function below should be
; combined into a single legal shuffle operation.
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py
; (see the NOTE at the top of this file); regenerate rather than hand-edit.

define <4 x float> @combine_undef_input_test16(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 4, i32 2, i32 3, i32 1>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test17(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test17:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test17:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test17:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test17:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 6, i32 0, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 5, i32 6, i32 0, i32 1>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test18(<4 x float> %a) {
; SSE2-LABEL: combine_undef_input_test18:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_undef_input_test18:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_undef_input_test18:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test18:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 5, i32 1, i32 7>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 0, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test19(<4 x float> %a) {
; SSE-LABEL: combine_undef_input_test19:
; SSE:       # %bb.0:
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_undef_input_test19:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,1]
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 5, i32 5>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
  ret <4 x float> %2
}

define <4 x float> @combine_undef_input_test20(<4 x float> %a) {
; CHECK-LABEL: combine_undef_input_test20:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 4, i32 1, i32 3>
  %2 = shufflevector <4 x float> %a, <4 x float> %1, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %2
}

; These tests are designed to test the ability to combine away unnecessary
; operations feeding into a shuffle. The AVX cases are the important ones as
; they leverage operations which cannot be done naturally on the entire vector
; and thus are decomposed into multiple smaller operations.

; NOTE(review): the CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of this file);
; regenerate them with that script rather than editing by hand.

; The shuffle only uses the high subvector of the add result, so the low-half
; add should be combined away (visible in the SSE lowering, which adds only
; the high 128-bit half).
define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) {
; SSE-LABEL: combine_unneeded_subvector1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,2,1,0]
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-SLOW-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4]
; AVX2-FAST-ALL-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-FAST-PERLANE-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-FAST-PERLANE-NEXT:    retq
  %b = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i32> %c
}

; Same idea with two sources: only the high half of %c (the add) is used.
define <8 x i32> @combine_unneeded_subvector2(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: combine_unneeded_subvector2:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,2,1,0]
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,2,1,0]
; SSE-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_unneeded_subvector2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_unneeded_subvector2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3]
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; AVX2-NEXT:    retq
  %c = add <8 x i32> %a, <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
  %d = shufflevector <8 x i32> %b, <8 x i32> %c, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 15, i32 14, i32 13, i32 12>
  ret <8 x i32> %d
}

; Shuffle pair matches insertps: insert b[2] into lane 0 of %a.
define <4 x float> @combine_insertps1(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps1:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps1:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[2],xmm0[1,2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 6, i32 2, i32 4>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 5, i32 1, i32 6, i32 3>
  ret <4 x float> %d
}

; Shuffle pair matches insertps: insert b[2] into lane 1 of %a.
define <4 x float> @combine_insertps2(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps2:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[0,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
; SSSE3-NEXT:    movaps %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_insertps2:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[2],xmm0[2,3]
; AVX-NEXT:    retq

  %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 1, i32 6, i32 7>
  %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32> <i32 4, i32 6, i32 2, i32 3>
  ret <4 x float> %d
}

define <4 x float> @combine_insertps3(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: combine_insertps3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: combine_insertps3:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: combine_insertps3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
2584; SSE41-NEXT: retq 2585; 2586; AVX-LABEL: combine_insertps3: 2587; AVX: # %bb.0: 2588; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] 2589; AVX-NEXT: retq 2590 2591 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2592 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 5, i32 3> 2593 ret <4 x float> %d 2594} 2595 2596define <4 x float> @combine_insertps4(<4 x float> %a, <4 x float> %b) { 2597; SSE2-LABEL: combine_insertps4: 2598; SSE2: # %bb.0: 2599; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 2600; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2601; SSE2-NEXT: retq 2602; 2603; SSSE3-LABEL: combine_insertps4: 2604; SSSE3: # %bb.0: 2605; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] 2606; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] 2607; SSSE3-NEXT: retq 2608; 2609; SSE41-LABEL: combine_insertps4: 2610; SSE41: # %bb.0: 2611; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2612; SSE41-NEXT: retq 2613; 2614; AVX-LABEL: combine_insertps4: 2615; AVX: # %bb.0: 2616; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] 2617; AVX-NEXT: retq 2618 2619 %c = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32><i32 0, i32 4, i32 2, i32 5> 2620 %d = shufflevector <4 x float> %a, <4 x float> %c, <4 x i32><i32 4, i32 1, i32 6, i32 5> 2621 ret <4 x float> %d 2622} 2623 2624define void @combine_scalar_load_with_blend_with_zero(ptr %a0, ptr %a1) { 2625; SSE-LABEL: combine_scalar_load_with_blend_with_zero: 2626; SSE: # %bb.0: 2627; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero 2628; SSE-NEXT: movaps %xmm0, (%rsi) 2629; SSE-NEXT: retq 2630; 2631; AVX-LABEL: combine_scalar_load_with_blend_with_zero: 2632; AVX: # %bb.0: 2633; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2634; AVX-NEXT: vmovaps %xmm0, (%rsi) 2635; AVX-NEXT: retq 2636 %1 = load double, ptr %a0, align 8 2637 %2 = insertelement <2 x double> undef, double %1, i32 0 2638 %3 = insertelement 
<2 x double> %2, double 0.000000e+00, i32 1 2639 %4 = bitcast <2 x double> %3 to <4 x float> 2640 %5 = shufflevector <4 x float> %4, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 1, i32 4, i32 3> 2641 store <4 x float> %5, ptr %a1, align 16 2642 ret void 2643} 2644 2645; PR30371 2646define <4 x float> @combine_constant_insertion_v4f32(float %f) { 2647; SSE2-LABEL: combine_constant_insertion_v4f32: 2648; SSE2: # %bb.0: 2649; SSE2-NEXT: movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0> 2650; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2651; SSE2-NEXT: movaps %xmm1, %xmm0 2652; SSE2-NEXT: retq 2653; 2654; SSSE3-LABEL: combine_constant_insertion_v4f32: 2655; SSSE3: # %bb.0: 2656; SSSE3-NEXT: movaps {{.*#+}} xmm1 = <u,4.0E+0,5.0E+0,3.0E+0> 2657; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] 2658; SSSE3-NEXT: movaps %xmm1, %xmm0 2659; SSSE3-NEXT: retq 2660; 2661; SSE41-LABEL: combine_constant_insertion_v4f32: 2662; SSE41: # %bb.0: 2663; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2664; SSE41-NEXT: retq 2665; 2666; AVX-LABEL: combine_constant_insertion_v4f32: 2667; AVX: # %bb.0: 2668; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1,2,3] 2669; AVX-NEXT: retq 2670 %a0 = insertelement <4 x float> undef, float %f, i32 0 2671 %ret = shufflevector <4 x float> %a0, <4 x float> <float undef, float 4.0, float 5.0, float 3.0>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2672 ret <4 x float> %ret 2673} 2674 2675define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { 2676; SSE2-LABEL: combine_constant_insertion_v4i32: 2677; SSE2: # %bb.0: 2678; SSE2-NEXT: movd %edi, %xmm1 2679; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30> 2680; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] 2681; SSE2-NEXT: retq 2682; 2683; SSSE3-LABEL: combine_constant_insertion_v4i32: 2684; SSSE3: # %bb.0: 2685; SSSE3-NEXT: movd %edi, %xmm1 2686; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,4,5,30> 2687; SSSE3-NEXT: movss {{.*#+}} xmm0 = 
xmm1[0],xmm0[1,2,3] 2688; SSSE3-NEXT: retq 2689; 2690; SSE41-LABEL: combine_constant_insertion_v4i32: 2691; SSE41: # %bb.0: 2692; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <u,4,5,30> 2693; SSE41-NEXT: pinsrd $0, %edi, %xmm0 2694; SSE41-NEXT: retq 2695; 2696; AVX-LABEL: combine_constant_insertion_v4i32: 2697; AVX: # %bb.0: 2698; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <u,4,5,30> 2699; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 2700; AVX-NEXT: retq 2701 %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 2702 %ret = shufflevector <4 x i32> %a0, <4 x i32> <i32 undef, i32 4, i32 5, i32 30>, <4 x i32> <i32 0, i32 5, i32 6, i32 7> 2703 ret <4 x i32> %ret 2704} 2705 2706define <4 x float> @PR22377(<4 x float> %a, <4 x float> %b) { 2707; SSE2-LABEL: PR22377: 2708; SSE2: # %bb.0: # %entry 2709; SSE2-NEXT: movaps %xmm0, %xmm1 2710; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[2,3] 2711; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,0,2] 2712; SSE2-NEXT: addps %xmm0, %xmm1 2713; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2714; SSE2-NEXT: retq 2715; 2716; SSSE3-LABEL: PR22377: 2717; SSSE3: # %bb.0: # %entry 2718; SSSE3-NEXT: movaps %xmm0, %xmm1 2719; SSSE3-NEXT: haddps %xmm0, %xmm1 2720; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2721; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2722; SSSE3-NEXT: retq 2723; 2724; SSE41-LABEL: PR22377: 2725; SSE41: # %bb.0: # %entry 2726; SSE41-NEXT: movaps %xmm0, %xmm1 2727; SSE41-NEXT: haddps %xmm0, %xmm1 2728; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2729; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2730; SSE41-NEXT: retq 2731; 2732; AVX-LABEL: PR22377: 2733; AVX: # %bb.0: # %entry 2734; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1 2735; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] 2736; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] 2737; AVX-NEXT: retq 2738entry: 2739 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 1, i32 3> 2740 %s2 = shufflevector <4 x 
float> %a, <4 x float> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2> 2741 %r2 = fadd <4 x float> %s1, %s2 2742 %s3 = shufflevector <4 x float> %s2, <4 x float> %r2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> 2743 ret <4 x float> %s3 2744} 2745 2746define <4 x float> @PR22390(<4 x float> %a, <4 x float> %b) { 2747; SSE2-LABEL: PR22390: 2748; SSE2: # %bb.0: # %entry 2749; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2750; SSE2-NEXT: movaps %xmm0, %xmm2 2751; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2752; SSE2-NEXT: addps %xmm2, %xmm0 2753; SSE2-NEXT: retq 2754; 2755; SSSE3-LABEL: PR22390: 2756; SSSE3: # %bb.0: # %entry 2757; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2758; SSSE3-NEXT: movaps %xmm0, %xmm2 2759; SSSE3-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 2760; SSSE3-NEXT: addps %xmm2, %xmm0 2761; SSSE3-NEXT: retq 2762; 2763; SSE41-LABEL: PR22390: 2764; SSE41: # %bb.0: # %entry 2765; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2766; SSE41-NEXT: blendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2767; SSE41-NEXT: addps %xmm1, %xmm0 2768; SSE41-NEXT: retq 2769; 2770; AVX-LABEL: PR22390: 2771; AVX: # %bb.0: # %entry 2772; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2] 2773; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] 2774; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 2775; AVX-NEXT: retq 2776entry: 2777 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> <i32 3, i32 0, i32 1, i32 2> 2778 %s2 = shufflevector <4 x float> %s1, <4 x float> %b, <4 x i32> <i32 4, i32 1, i32 2, i32 3> 2779 %r2 = fadd <4 x float> %s1, %s2 2780 ret <4 x float> %r2 2781} 2782 2783define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) { 2784; SSE-LABEL: PR22412: 2785; SSE: # %bb.0: # %entry 2786; SSE-NEXT: movaps %xmm3, %xmm1 2787; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] 2788; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[3,2] 2789; SSE-NEXT: retq 2790; 2791; AVX1-LABEL: PR22412: 2792; AVX1: # %bb.0: # %entry 2793; AVX1-NEXT: vperm2f128 
{{.*#+}} ymm2 = ymm1[2,3,0,1] 2794; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2795; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm2[3,2],ymm0[5,4],ymm2[7,6] 2796; AVX1-NEXT: retq 2797; 2798; AVX2-LABEL: PR22412: 2799; AVX2: # %bb.0: # %entry 2800; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 2801; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1] 2802; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6] 2803; AVX2-NEXT: retq 2804entry: 2805 %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2806 %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2> 2807 ret <8 x float> %s2 2808} 2809 2810define <4 x float> @PR30264(<4 x float> %x) { 2811; SSE2-LABEL: PR30264: 2812; SSE2: # %bb.0: 2813; SSE2-NEXT: xorps %xmm1, %xmm1 2814; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2815; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] 2816; SSE2-NEXT: movaps %xmm1, %xmm0 2817; SSE2-NEXT: retq 2818; 2819; SSSE3-LABEL: PR30264: 2820; SSSE3: # %bb.0: 2821; SSSE3-NEXT: xorps %xmm1, %xmm1 2822; SSSE3-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2823; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],mem[2,3] 2824; SSSE3-NEXT: movaps %xmm1, %xmm0 2825; SSSE3-NEXT: retq 2826; 2827; SSE41-LABEL: PR30264: 2828; SSE41: # %bb.0: 2829; SSE41-NEXT: movaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0> 2830; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3] 2831; SSE41-NEXT: movaps %xmm1, %xmm0 2832; SSE41-NEXT: retq 2833; 2834; AVX-LABEL: PR30264: 2835; AVX: # %bb.0: 2836; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <u,u,4.0E+0,1.0E+0> 2837; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2,3] 2838; AVX-NEXT: retq 2839 %shuf1 = shufflevector <4 x float> %x, <4 x float> <float undef, float 0.0, float undef, float undef>, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef> 2840 %shuf2 = 
shufflevector <4 x float> %shuf1, <4 x float> <float undef, float undef, float 4.0, float 1.0>, <4 x i32> <i32 0, i32 1, i32 6, i32 7> 2841 ret <4 x float> %shuf2 2842} 2843 2844define <8 x i16> @PR39549(<16 x i8> %x) { 2845; SSE-LABEL: PR39549: 2846; SSE: # %bb.0: 2847; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2848; SSE-NEXT: psraw $8, %xmm0 2849; SSE-NEXT: retq 2850; 2851; AVX-LABEL: PR39549: 2852; AVX: # %bb.0: 2853; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 2854; AVX-NEXT: vpsraw $8, %xmm0, %xmm0 2855; AVX-NEXT: retq 2856 %a = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 9, i32 undef, i32 10, i32 undef, i32 11, i32 undef, i32 12, i32 undef, i32 13, i32 undef, i32 14, i32 undef, i32 15, i32 undef> 2857 %b = bitcast <16 x i8> %a to <8 x i16> 2858 %c = shl <8 x i16> %b, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2859 %d = ashr <8 x i16> %c, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2860 ret <8 x i16> %d 2861} 2862 2863define <4 x i32> @PR41545(<4 x i32> %a0, <16 x i8> %a1) { 2864; SSE-LABEL: PR41545: 2865; SSE: # %bb.0: 2866; SSE-NEXT: paddd %xmm1, %xmm0 2867; SSE-NEXT: retq 2868; 2869; AVX-LABEL: PR41545: 2870; AVX: # %bb.0: 2871; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2872; AVX-NEXT: retq 2873 %1 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12> 2874 %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13> 2875 %3 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14> 2876 %4 = shufflevector <16 x i8> %a1, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15> 2877 %5 = zext <4 x i8> %1 to <4 x i32> 2878 %6 = zext <4 x i8> %2 to <4 x i32> 2879 %7 = zext <4 x i8> %3 to <4 x i32> 2880 %8 = zext <4 x i8> %4 to <4 x i32> 2881 %9 = shl <4 x i32> %6, <i32 8, i32 8, i32 8, i32 8> 2882 %10 = shl <4 x i32> %7, <i32 
16, i32 16, i32 16, i32 16> 2883 %11 = shl <4 x i32> %8, <i32 24, i32 24, i32 24, i32 24> 2884 %12 = or <4 x i32> %5, %9 2885 %13 = or <4 x i32> %12, %10 2886 %14 = or <4 x i32> %13, %11 2887 %15 = add <4 x i32> %a0, %14 2888 ret <4 x i32> %15 2889} 2890 2891define <8 x i16> @shuffle_extract_insert(<8 x i16> %a) { 2892; SSE-LABEL: shuffle_extract_insert: 2893; SSE: # %bb.0: 2894; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2895; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2896; SSE-NEXT: retq 2897; 2898; AVX1-LABEL: shuffle_extract_insert: 2899; AVX1: # %bb.0: 2900; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2901; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2902; AVX1-NEXT: retq 2903; 2904; AVX2-SLOW-LABEL: shuffle_extract_insert: 2905; AVX2-SLOW: # %bb.0: 2906; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] 2907; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] 2908; AVX2-SLOW-NEXT: retq 2909; 2910; AVX2-FAST-LABEL: shuffle_extract_insert: 2911; AVX2-FAST: # %bb.0: 2912; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,2,3,0,1,6,7,12,13,10,11,8,9,14,15] 2913; AVX2-FAST-NEXT: retq 2914 %a0 = extractelement <8 x i16> %a, i32 0 2915 %a1 = extractelement <8 x i16> %a, i32 1 2916 %a3 = extractelement <8 x i16> %a, i32 3 2917 %a4 = extractelement <8 x i16> %a, i32 4 2918 %a5 = extractelement <8 x i16> %a, i32 5 2919 %a6 = extractelement <8 x i16> %a, i32 6 2920 %a7 = extractelement <8 x i16> %a, i32 7 2921 %1 = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 2, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2922 %2 = insertelement <8 x i16> %1, i16 %a1, i32 1 2923 %3 = insertelement <8 x i16> %2, i16 %a0, i32 2 2924 %4 = insertelement <8 x i16> %3, i16 %a3, i32 3 2925 %5 = insertelement <8 x i16> %4, i16 %a6, i32 4 2926 %6 = insertelement <8 x i16> %5, i16 %a5, i32 5 2927 %7 = insertelement <8 x i16> %6, i16 %a4, i32 6 2928 %8 = insertelement <8 x 
i16> %7, i16 %a7, i32 7 2929 ret <8 x i16> %8 2930} 2931 2932define <8 x i16> @shuffle_extract_insert_double(<8 x i16> %a, <8 x i16> %b) { 2933; SSE2-LABEL: shuffle_extract_insert_double: 2934; SSE2: # %bb.0: 2935; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 2936; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 2937; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2938; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] 2939; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2940; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2941; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2942; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 2943; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2944; SSE2-NEXT: retq 2945; 2946; SSSE3-LABEL: shuffle_extract_insert_double: 2947; SSSE3: # %bb.0: 2948; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2949; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2950; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2951; SSSE3-NEXT: retq 2952; 2953; SSE41-LABEL: shuffle_extract_insert_double: 2954; SSE41: # %bb.0: 2955; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2956; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2957; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2958; SSE41-NEXT: retq 2959; 2960; AVX-LABEL: shuffle_extract_insert_double: 2961; AVX: # %bb.0: 2962; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 2963; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 2964; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2965; AVX-NEXT: retq 2966 %a0 = extractelement <8 x i16> %a, i32 0 2967 %a4 = 
extractelement <8 x i16> %a, i32 4 2968 %a6 = extractelement <8 x i16> %a, i32 6 2969 %b11 = extractelement <8 x i16> %b, i32 3 2970 %b13 = extractelement <8 x i16> %b, i32 5 2971 %b15 = extractelement <8 x i16> %b, i32 7 2972 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 2973 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2 2974 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3 2975 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4 2976 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5 2977 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6 2978 %7 = insertelement <8 x i16> %6, i16 %b15, i32 7 2979 ret <8 x i16> %7 2980} 2981 2982define <8 x i16> @shuffle_extract_concat_insert(<4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i16> %b) { 2983; SSE2-LABEL: shuffle_extract_concat_insert: 2984; SSE2: # %bb.0: 2985; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2986; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2987; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2988; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2989; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] 2990; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,3,2,3,4,5,6,7] 2991; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] 2992; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2993; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] 2994; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2995; SSE2-NEXT: retq 2996; 2997; SSSE3-LABEL: shuffle_extract_concat_insert: 2998; SSSE3: # %bb.0: 2999; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3000; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 3001; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3002; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 3003; SSSE3-NEXT: retq 
3004; 3005; SSE41-LABEL: shuffle_extract_concat_insert: 3006; SSE41: # %bb.0: 3007; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3008; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 3009; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3010; SSE41-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 3011; SSE41-NEXT: retq 3012; 3013; AVX-LABEL: shuffle_extract_concat_insert: 3014; AVX: # %bb.0: 3015; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3016; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u] 3017; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[0,1,6,7,10,11,14,15,u,u,u,u,u,u,u,u] 3018; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3019; AVX-NEXT: retq 3020 %a = shufflevector <4 x i16> %lhsa, <4 x i16> %rhsa, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3021 %a0 = extractelement <8 x i16> %a, i32 0 3022 %a4 = extractelement <8 x i16> %a, i32 4 3023 %a6 = extractelement <8 x i16> %a, i32 6 3024 %b11 = extractelement <8 x i16> %b, i32 3 3025 %b13 = extractelement <8 x i16> %b, i32 5 3026 %b15 = extractelement <8 x i16> %b, i32 7 3027 %1 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 8, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> 3028 %2 = insertelement <8 x i16> %1, i16 %a0, i32 2 3029 %3 = insertelement <8 x i16> %2, i16 %b11, i32 3 3030 %4 = insertelement <8 x i16> %3, i16 %a6, i32 4 3031 %5 = insertelement <8 x i16> %4, i16 %b13, i32 5 3032 %6 = insertelement <8 x i16> %5, i16 %a4, i32 6 3033 %7 = insertelement <8 x i16> %6, i16 %b15, i32 7 3034 ret <8 x i16> %7 3035} 3036 3037define <8 x i16> @shuffle_scalar_to_vector_extract(ptr %p0, ptr %p1, ptr %p2) { 3038; SSE2-LABEL: shuffle_scalar_to_vector_extract: 3039; SSE2: # %bb.0: 3040; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3041; SSE2-NEXT: punpcklbw 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 3042; SSE2-NEXT: psraw $8, %xmm1 3043; SSE2-NEXT: pextrw $7, %xmm1, %eax 3044; SSE2-NEXT: movd %eax, %xmm2 3045; SSE2-NEXT: movsbl (%rsi), %eax 3046; SSE2-NEXT: movd %eax, %xmm0 3047; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 3048; SSE2-NEXT: movsbl (%rdx), %eax 3049; SSE2-NEXT: movd %eax, %xmm0 3050; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3051; SSE2-NEXT: pxor %xmm0, %xmm0 3052; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3053; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3054; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3055; SSE2-NEXT: retq 3056; 3057; SSSE3-LABEL: shuffle_scalar_to_vector_extract: 3058; SSSE3: # %bb.0: 3059; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 3060; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] 3061; SSSE3-NEXT: psraw $8, %xmm1 3062; SSSE3-NEXT: movsbl (%rsi), %eax 3063; SSSE3-NEXT: movd %eax, %xmm2 3064; SSSE3-NEXT: palignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13] 3065; SSSE3-NEXT: movsbl (%rdx), %eax 3066; SSSE3-NEXT: movd %eax, %xmm0 3067; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 3068; SSSE3-NEXT: pxor %xmm0, %xmm0 3069; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 3070; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] 3071; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 3072; SSSE3-NEXT: retq 3073; 3074; SSE41-LABEL: shuffle_scalar_to_vector_extract: 3075; SSE41: # %bb.0: 3076; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 3077; SSE41-NEXT: pextrw $4, %xmm0, %eax 3078; 
SSE41-NEXT: pextrw $7, %xmm0, %ecx 3079; SSE41-NEXT: pxor %xmm0, %xmm0 3080; SSE41-NEXT: pinsrw $1, %eax, %xmm0 3081; SSE41-NEXT: movl $65531, %eax # imm = 0xFFFB 3082; SSE41-NEXT: pinsrw $2, %eax, %xmm0 3083; SSE41-NEXT: pinsrw $4, %ecx, %xmm0 3084; SSE41-NEXT: movsbl (%rsi), %eax 3085; SSE41-NEXT: pinsrw $5, %eax, %xmm0 3086; SSE41-NEXT: movsbl (%rdx), %eax 3087; SSE41-NEXT: pinsrw $6, %eax, %xmm0 3088; SSE41-NEXT: retq 3089; 3090; AVX-LABEL: shuffle_scalar_to_vector_extract: 3091; AVX: # %bb.0: 3092; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 3093; AVX-NEXT: vpextrw $4, %xmm0, %eax 3094; AVX-NEXT: vpextrw $7, %xmm0, %ecx 3095; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3096; AVX-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0 3097; AVX-NEXT: movl $65531, %eax # imm = 0xFFFB 3098; AVX-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 3099; AVX-NEXT: vpinsrw $4, %ecx, %xmm0, %xmm0 3100; AVX-NEXT: movsbl (%rsi), %eax 3101; AVX-NEXT: vpinsrw $5, %eax, %xmm0, %xmm0 3102; AVX-NEXT: movsbl (%rdx), %eax 3103; AVX-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0 3104; AVX-NEXT: retq 3105 %tmp = load <8 x i8>, ptr %p0, align 1 3106 %tmp1 = sext <8 x i8> %tmp to <8 x i16> 3107 %tmp2 = load i8, ptr %p1, align 1 3108 %cvt1 = sext i8 %tmp2 to i16 3109 %tmp3 = load i8, ptr %p2, align 1 3110 %cvt2 = sext i8 %tmp3 to i16 3111 %tmp4 = extractelement <8 x i16> %tmp1, i32 4 3112 %tmp5 = extractelement <8 x i16> %tmp1, i32 7 3113 %tmp6 = insertelement <8 x i16> <i16 undef, i16 undef, i16 -5, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, i16 undef, i32 0 3114 %tmp7 = insertelement <8 x i16> %tmp6, i16 %tmp4, i32 1 3115 %tmp8 = insertelement <8 x i16> %tmp7, i16 undef, i32 3 3116 %tmp9 = insertelement <8 x i16> %tmp8, i16 %tmp5, i32 4 3117 %tmp10 = insertelement <8 x i16> %tmp9, i16 %cvt1, i32 5 3118 %tmp11 = insertelement <8 x i16> %tmp10, i16 %cvt2, i32 6 3119 %tmp12 = insertelement <8 x i16> %tmp11, i16 undef, i32 7 3120 %tmp13 = shufflevector <8 x i16> %tmp12, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 10, i32 3, i32 
4, i32 5, i32 6, i32 7> 3121 ret <8 x i16> %tmp13 3122} 3123 3124; Bug noticed in D96345 3125define i32 @shuffle_binops_with_undef() { 3126; SSE-LABEL: shuffle_binops_with_undef: 3127; SSE: # %bb.0: # %entry 3128; SSE-NEXT: movdqa (%rax), %xmm0 3129; SSE-NEXT: paddw %xmm0, %xmm0 3130; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 3131; SSE-NEXT: psrlw %xmm1, %xmm0 3132; SSE-NEXT: movdqa %xmm0, (%rax) 3133; SSE-NEXT: retq 3134; 3135; AVX-LABEL: shuffle_binops_with_undef: 3136; AVX: # %bb.0: # %entry 3137; AVX-NEXT: vmovdqa (%rax), %xmm0 3138; AVX-NEXT: vpaddw %xmm0, %xmm0, %xmm0 3139; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 3140; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 3141; AVX-NEXT: vmovdqa %xmm0, (%rax) 3142; AVX-NEXT: retq 3143entry: 3144 %load0 = load <8 x i16>, ptr undef, align 16 3145 %load1 = load <8 x i16>, ptr undef, align 16 3146 %shuf0 = shufflevector <16 x i8> undef, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> 3147 %addi = add <8 x i16> %load0, %load1 3148 %bc0 = bitcast <8 x i16> %addi to <2 x i64> 3149 %bc1 = bitcast <16 x i8> %shuf0 to <8 x i16> 3150 %shuf1 = shufflevector <8 x i16> %load1, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> 3151 %addi24 = add <8 x i16> %shuf1, %bc1 3152 %bc2 = bitcast <8 x i16> %addi24 to <2 x i64> 3153 %shuf2 = shufflevector <2 x i64> %bc0, <2 x i64> %bc2, <2 x i32> <i32 0, i32 2> 3154 %bc3 = bitcast <2 x i64> %shuf2 to <8 x i16> 3155 %psrli = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %bc3, i32 ptrtoint (ptr @shuffle_binops_with_undef to i32)) 3156 store <8 x i16> %psrli, ptr undef, align 16 3157 ret i32 undef 3158} 3159declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) 3160 3161define void 
@PR43024() { 3162; SSE2-LABEL: PR43024: 3163; SSE2: # %bb.0: 3164; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3165; SSE2-NEXT: movaps %xmm0, (%rax) 3166; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3167; SSE2-NEXT: xorps %xmm1, %xmm1 3168; SSE2-NEXT: addss %xmm1, %xmm0 3169; SSE2-NEXT: addss %xmm1, %xmm0 3170; SSE2-NEXT: movss %xmm0, (%rax) 3171; SSE2-NEXT: retq 3172; 3173; SSSE3-LABEL: PR43024: 3174; SSSE3: # %bb.0: 3175; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3176; SSSE3-NEXT: movaps %xmm0, (%rax) 3177; SSSE3-NEXT: addss %xmm0, %xmm0 3178; SSSE3-NEXT: xorps %xmm1, %xmm1 3179; SSSE3-NEXT: addss %xmm1, %xmm0 3180; SSSE3-NEXT: addss %xmm1, %xmm0 3181; SSSE3-NEXT: movss %xmm0, (%rax) 3182; SSSE3-NEXT: retq 3183; 3184; SSE41-LABEL: PR43024: 3185; SSE41: # %bb.0: 3186; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3187; SSE41-NEXT: movaps %xmm0, (%rax) 3188; SSE41-NEXT: addss %xmm0, %xmm0 3189; SSE41-NEXT: xorps %xmm1, %xmm1 3190; SSE41-NEXT: addss %xmm1, %xmm0 3191; SSE41-NEXT: addss %xmm1, %xmm0 3192; SSE41-NEXT: movss %xmm0, (%rax) 3193; SSE41-NEXT: retq 3194; 3195; AVX-LABEL: PR43024: 3196; AVX: # %bb.0: 3197; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] 3198; AVX-NEXT: vmovaps %xmm0, (%rax) 3199; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0 3200; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 3201; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 3202; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+12(%rip), %xmm0, %xmm0 3203; AVX-NEXT: vmovss %xmm0, (%rax) 3204; AVX-NEXT: retq 3205 store <4 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000, float 0x0, float 0x0>, ptr undef, align 16 3206 %1 = load <4 x float>, ptr undef, align 16 3207 %2 = fmul <4 x float> %1, <float 0x0, float 0x0, float 0x0, float 0x0> 3208 %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> 3209 %4 = fadd <4 x float> %2, %3 3210 %5 = fadd <4 x float> zeroinitializer, 
%4 3211 %6 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef> 3212 %7 = fadd <4 x float> %6, %5 3213 %8 = extractelement <4 x float> %7, i32 0 3214 store float %8, ptr undef, align 8 3215 ret void 3216} 3217 3218define void @PR45604(ptr %dst, ptr %src) { 3219; SSE2-LABEL: PR45604: 3220; SSE2: # %bb.0: 3221; SSE2-NEXT: movdqa (%rsi), %xmm0 3222; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] 3223; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 3224; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,0,65535,65535,65535] 3225; SSE2-NEXT: movdqa %xmm2, %xmm3 3226; SSE2-NEXT: pandn %xmm1, %xmm3 3227; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0] 3228; SSE2-NEXT: por %xmm1, %xmm3 3229; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1] 3230; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 3231; SSE2-NEXT: movdqa %xmm2, %xmm5 3232; SSE2-NEXT: pandn %xmm4, %xmm5 3233; SSE2-NEXT: por %xmm1, %xmm5 3234; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,2,2,2] 3235; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] 3236; SSE2-NEXT: movdqa %xmm2, %xmm6 3237; SSE2-NEXT: pandn %xmm4, %xmm6 3238; SSE2-NEXT: por %xmm1, %xmm6 3239; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] 3240; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 3241; SSE2-NEXT: pandn %xmm0, %xmm2 3242; SSE2-NEXT: por %xmm1, %xmm2 3243; SSE2-NEXT: movdqa %xmm2, 48(%rdi) 3244; SSE2-NEXT: movdqa %xmm6, 32(%rdi) 3245; SSE2-NEXT: movdqa %xmm5, 16(%rdi) 3246; SSE2-NEXT: movdqa %xmm3, (%rdi) 3247; SSE2-NEXT: retq 3248; 3249; SSSE3-LABEL: PR45604: 3250; SSSE3: # %bb.0: 3251; SSSE3-NEXT: movdqa (%rsi), %xmm0 3252; SSSE3-NEXT: movdqa %xmm0, %xmm1 3253; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,zero,zero,zero,zero,xmm1[2,3],zero,zero,zero,zero,zero,zero 3254; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,11,0,0,0,0,0,0,0,11,0,0,0] 3255; SSSE3-NEXT: por %xmm2, %xmm1 3256; SSSE3-NEXT: movdqa %xmm0, %xmm3 3257; 
SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[4,5],zero,zero,zero,zero,zero,zero,xmm3[6,7],zero,zero,zero,zero,zero,zero 3258; SSSE3-NEXT: por %xmm2, %xmm3 3259; SSSE3-NEXT: movdqa %xmm0, %xmm4 3260; SSSE3-NEXT: pshufb {{.*#+}} xmm4 = xmm4[8,9],zero,zero,zero,zero,zero,zero,xmm4[10,11],zero,zero,zero,zero,zero,zero 3261; SSSE3-NEXT: por %xmm2, %xmm4 3262; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[12,13],zero,zero,zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero,zero,zero 3263; SSSE3-NEXT: por %xmm2, %xmm0 3264; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) 3265; SSSE3-NEXT: movdqa %xmm4, 32(%rdi) 3266; SSSE3-NEXT: movdqa %xmm3, 16(%rdi) 3267; SSSE3-NEXT: movdqa %xmm1, (%rdi) 3268; SSSE3-NEXT: retq 3269; 3270; SSE41-LABEL: PR45604: 3271; SSE41: # %bb.0: 3272; SSE41-NEXT: movdqa (%rsi), %xmm0 3273; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] 3274; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3275; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <u,0,11,0,u,0,11,0> 3276; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] 3277; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 3278; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3279; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1,2,3],xmm3[4],xmm2[5,6,7] 3280; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,3,3,3] 3281; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero 3282; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2,3],xmm4[4],xmm2[5,6,7] 3283; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3284; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7] 3285; SSE41-NEXT: movdqa %xmm0, (%rdi) 3286; SSE41-NEXT: movdqa %xmm4, 48(%rdi) 3287; SSE41-NEXT: movdqa %xmm3, 32(%rdi) 3288; SSE41-NEXT: movdqa %xmm1, 16(%rdi) 3289; SSE41-NEXT: retq 3290; 3291; AVX1-LABEL: PR45604: 3292; AVX1: # %bb.0: 3293; AVX1-NEXT: vmovdqa (%rsi), %xmm0 3294; AVX1-NEXT: vpshufd 
{{.*#+}} xmm1 = xmm0[3,3,3,3] 3295; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero 3296; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [11,11,11,0,11,11,11,0] 3297; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] 3298; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 3299; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3300; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 3301; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 3302; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 3303; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 3304; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 3305; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 3306; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] 3307; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 3308; AVX1-NEXT: vmovups %ymm0, (%rdi) 3309; AVX1-NEXT: vmovups %ymm1, 32(%rdi) 3310; AVX1-NEXT: vzeroupper 3311; AVX1-NEXT: retq 3312; 3313; AVX2-LABEL: PR45604: 3314; AVX2: # %bb.0: 3315; AVX2-NEXT: vmovdqa (%rsi), %xmm0 3316; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2] 3317; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u> 3318; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 3319; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = <u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0,u,u,u,u,11,0,0,0> 3320; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] 3321; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,1,3] 3322; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 3323; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] 3324; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) 3325; AVX2-NEXT: vmovdqu %ymm1, (%rdi) 3326; AVX2-NEXT: vzeroupper 3327; AVX2-NEXT: retq 3328 %v1 = load 
<8 x i16>, ptr %src, align 16 3329 %v2 = shufflevector <8 x i16> %v1, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3330 %v3 = shufflevector <16 x i16> %v2, <16 x i16> <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31> 3331 store <32 x i16> %v3, ptr %dst, align 16 3332 ret void 3333} 3334 3335; getFauxShuffle AND/ANDN decoding wrongly assumed an undef src always gives an undef dst. 3336define <2 x i64> @PR55157(ptr %0) { 3337; SSE-LABEL: PR55157: 3338; SSE: # %bb.0: 3339; SSE-NEXT: xorps %xmm0, %xmm0 3340; SSE-NEXT: retq 3341; 3342; AVX-LABEL: PR55157: 3343; AVX: # %bb.0: 3344; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 3345; AVX-NEXT: retq 3346 %2 = load <16 x i8>, ptr %0, align 16 3347 %3 = icmp eq <16 x i8> %2, zeroinitializer 3348 %4 = tail call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> zeroinitializer, <16 x i8> zeroinitializer) 3349 %5 = select <16 x i1> %3, <16 x i8> <i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> %4 3350 %6 = shufflevector <16 x i8> %5, <16 x i8> poison, <16 x i32> <i32 8, i32 8, i32 9, i32 9, i32 10, i32 10, i32 11, i32 11, i32 12, i32 12, i32 13, i32 13, i32 14, i32 14, i32 15, i32 15> 3351 %7 = bitcast <16 x i8> %6 to <2 x i64> 3352 ret <2 x i64> %7 3353} 3354declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) 3355 3356; SelectionDAG::isSplatValue - incorrect handling of undef sub-elements 3357define <2 x i64> @PR56520(<16 x i8> %0) { 3358; SSE-LABEL: PR56520: 3359; SSE: # %bb.0: 3360; SSE-NEXT: pxor 
%xmm1, %xmm1 3361; SSE-NEXT: pcmpeqb %xmm0, %xmm1 3362; SSE-NEXT: movd %xmm1, %eax 3363; SSE-NEXT: movsbl %al, %eax 3364; SSE-NEXT: movd %eax, %xmm0 3365; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3366; SSE-NEXT: retq 3367; 3368; AVX1-LABEL: PR56520: 3369; AVX1: # %bb.0: 3370; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 3371; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 3372; AVX1-NEXT: vmovd %xmm0, %eax 3373; AVX1-NEXT: movsbl %al, %eax 3374; AVX1-NEXT: vmovd %eax, %xmm0 3375; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 3376; AVX1-NEXT: retq 3377; 3378; AVX2-SLOW-LABEL: PR56520: 3379; AVX2-SLOW: # %bb.0: 3380; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 3381; AVX2-SLOW-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 3382; AVX2-SLOW-NEXT: vmovd %xmm0, %eax 3383; AVX2-SLOW-NEXT: movsbl %al, %eax 3384; AVX2-SLOW-NEXT: vmovd %eax, %xmm0 3385; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 3386; AVX2-SLOW-NEXT: retq 3387; 3388; AVX2-FAST-LABEL: PR56520: 3389; AVX2-FAST: # %bb.0: 3390; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 3391; AVX2-FAST-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 3392; AVX2-FAST-NEXT: vmovd %xmm0, %eax 3393; AVX2-FAST-NEXT: movsbl %al, %eax 3394; AVX2-FAST-NEXT: vmovd %eax, %xmm0 3395; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero 3396; AVX2-FAST-NEXT: retq 3397 %2 = icmp eq <16 x i8> zeroinitializer, %0 3398 %3 = extractelement <16 x i1> %2, i64 0 3399 %4 = sext i1 %3 to i32 3400 %5 = insertelement <2 x i32> zeroinitializer, i32 %4, i64 0 3401 %6 = zext <2 x i32> %5 to <2 x i64> 3402 %7 = shufflevector <2 x i64> %6, <2 x i64> zeroinitializer, <2 x i32> zeroinitializer 3403 ret <2 x i64> %7 3404} 3405 3406; Test case reported on D105827 3407define void @SpinningCube() { 3408; SSE2-LABEL: SpinningCube: 3409; SSE2: # %bb.0: # %entry 3410; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 3411; SSE2-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0> 3412; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 3413; 
SSE2-NEXT: movapd {{.*#+}} xmm2 = <u,u,-2.0E+0,u> 3414; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3415; SSE2-NEXT: xorps %xmm3, %xmm3 3416; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] 3417; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 3418; SSE2-NEXT: addps %xmm3, %xmm1 3419; SSE2-NEXT: movaps %xmm1, (%rax) 3420; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 3421; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] 3422; SSE2-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 3423; SSE2-NEXT: addps %xmm0, %xmm1 3424; SSE2-NEXT: movaps %xmm1, (%rax) 3425; SSE2-NEXT: retq 3426; 3427; SSSE3-LABEL: SpinningCube: 3428; SSSE3: # %bb.0: # %entry 3429; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 3430; SSSE3-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0> 3431; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 3432; SSSE3-NEXT: movapd {{.*#+}} xmm2 = <u,u,-2.0E+0,u> 3433; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] 3434; SSSE3-NEXT: xorps %xmm3, %xmm3 3435; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] 3436; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,3] 3437; SSSE3-NEXT: addps %xmm3, %xmm1 3438; SSSE3-NEXT: movaps %xmm1, (%rax) 3439; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 3440; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,2] 3441; SSSE3-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 3442; SSSE3-NEXT: addps %xmm0, %xmm1 3443; SSSE3-NEXT: movaps %xmm1, (%rax) 3444; SSSE3-NEXT: retq 3445; 3446; SSE41-LABEL: SpinningCube: 3447; SSE41: # %bb.0: # %entry 3448; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 3449; SSE41-NEXT: movaps {{.*#+}} xmm0 = <u,u,u,1.0E+0> 3450; SSE41-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> 3451; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 3452; SSE41-NEXT: movaps %xmm1, %xmm3 3453; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0] 3454; SSE41-NEXT: movaps %xmm0, %xmm4 3455; SSE41-NEXT: insertps {{.*#+}} xmm4 = 
xmm4[0],xmm2[0],xmm4[2,3] 3456; SSE41-NEXT: addps %xmm3, %xmm4 3457; SSE41-NEXT: movaps %xmm4, (%rax) 3458; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 3459; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,2] 3460; SSE41-NEXT: mulps %xmm1, %xmm2 3461; SSE41-NEXT: addps %xmm0, %xmm2 3462; SSE41-NEXT: movaps %xmm2, (%rax) 3463; SSE41-NEXT: retq 3464; 3465; AVX1-LABEL: SpinningCube: 3466; AVX1: # %bb.0: # %entry 3467; AVX1-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 3468; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = <u,u,u,1.0E+0> 3469; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> 3470; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 3471; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] 3472; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] 3473; AVX1-NEXT: vaddps %xmm2, %xmm3, %xmm2 3474; AVX1-NEXT: vmovaps %xmm2, (%rax) 3475; AVX1-NEXT: vbroadcastss (%rax), %xmm2 3476; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1 3477; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 3478; AVX1-NEXT: vmovaps %xmm0, (%rax) 3479; AVX1-NEXT: retq 3480; 3481; AVX2-LABEL: SpinningCube: 3482; AVX2: # %bb.0: # %entry 3483; AVX2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 3484; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] 3485; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> 3486; AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 3487; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] 3488; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] 3489; AVX2-NEXT: vaddps %xmm2, %xmm3, %xmm2 3490; AVX2-NEXT: vmovaps %xmm2, (%rax) 3491; AVX2-NEXT: vbroadcastss (%rax), %xmm2 3492; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1 3493; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 3494; AVX2-NEXT: vmovaps %xmm0, (%rax) 3495; AVX2-NEXT: retq 3496entry: 3497 store float 1.000000e+00, ptr undef, align 4 3498 %0 = load float, ptr undef, align 4 3499 %1 = fmul float undef, 0.000000e+00 3500 %2 = insertelement <4 x float> 
poison, float %0, i32 3 3501 %3 = load float, ptr undef, align 4 3502 %4 = insertelement <2 x float> poison, float %3, i32 0 3503 %5 = shufflevector <2 x float> %4, <2 x float> poison, <2 x i32> zeroinitializer 3504 %6 = fmul <2 x float> %5, <float 0.000000e+00, float -2.000000e+00> 3505 %7 = fadd float %1, undef 3506 %8 = shufflevector <2 x float> %6, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 3507 %9 = shufflevector <4 x float> undef, <4 x float> %8, <4 x i32> <i32 0, i32 4, i32 5, i32 undef> 3508 %10 = insertelement <4 x float> %9, float %7, i32 3 3509 %11 = insertelement <4 x float> %2, float 0x7FF8000000000000, i32 1 3510 %12 = insertelement <4 x float> %11, float undef, i32 0 3511 %13 = insertelement <4 x float> %12, float undef, i32 2 3512 %14 = fadd <4 x float> %10, %13 3513 store <4 x float> %14, ptr undef, align 16 3514 %15 = load float, ptr undef, align 4 3515 %16 = insertelement <2 x float> poison, float %15, i32 0 3516 %17 = shufflevector <2 x float> %16, <2 x float> poison, <2 x i32> zeroinitializer 3517 %18 = fmul <2 x float> %17, <float 0.000000e+00, float -2.000000e+00> 3518 %19 = shufflevector <2 x float> %18, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> 3519 %20 = shufflevector <4 x float> undef, <4 x float> %19, <4 x i32> <i32 0, i32 4, i32 5, i32 undef> 3520 %21 = fadd <4 x float> %20, %2 3521 store <4 x float> %21, ptr undef, align 16 3522 ret void 3523} 3524