; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is simply expanded
; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) if the vector of shift
; counts is a constant build_vector.

define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    pslld $1, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $1, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
; into two pmullw instructions. With AVX2, the test case below would produce
; a single vpmullw.

define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
; into two pmulld instructions. With AVX2, the test case below would produce
; a single vpsllvd instead.

define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-LABEL: test6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,8,8]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE2, we instead split the shift into four
; parts and convert each part into a pmullw.

define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm4, %xmm1
; SSE-NEXT:    pmullw %xmm4, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support we produce
; a single vpsllvd/vpsllvq instead of a pair of them.

define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-LABEL: test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2,2,8,8]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm4, %xmm1
; SSE41-NEXT:    pmulld %xmm4, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; Without AVX2/AVX512F support, the shifts in 'test9' are performed as separate
; immediate shifts whose results are then blended together.

define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-LABEL: test9:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm3
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    paddq %xmm0, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm2
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test9:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    paddq %xmm0, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm2
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}