; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2


; Verify that the following shifts are lowered into a sequence of two shifts plus
; a blend. On pre-AVX2 targets, instead of scalarizing a logical or arithmetic
; packed shift right by a constant build_vector, the backend should always try to
; emit the simpler sequence of two shifts plus a blend when possible.

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $2, %xmm1
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test1:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test1:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <8 x i16> @test2(<8 x i16> %a) {
; SSE-LABEL: test2:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrlw $2, %xmm1
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test2:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test2:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <4 x i32> @test3(<4 x i32> %a) {
; SSE-LABEL: test3:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $2, %xmm1
; SSE-NEXT:    psrld $3, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test3:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test3:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE-LABEL: test4:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $2, %xmm1
; SSE-NEXT:    psrld $3, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test4:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrld $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test4:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <8 x i16> @test5(<8 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $2, %xmm1
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test5:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test5:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX2-NEXT:    retq
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <8 x i16> @test6(<8 x i16> %a) {
; SSE-LABEL: test6:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psraw $2, %xmm1
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test6:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test6:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX2-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT:    retq
  %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
  ret <8 x i16> %lshr
}

define <4 x i32> @test7(<4 x i32> %a) {
; SSE-LABEL: test7:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    psrad $3, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test7:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
  ret <4 x i32> %lshr
}

define <4 x i32> @test8(<4 x i32> %a) {
; SSE-LABEL: test8:
; SSE:       # BB#0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrad $2, %xmm1
; SSE-NEXT:    psrad $3, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: test8:
; AVX1:       # BB#0:
; AVX1-NEXT:    vpsrad $2, %xmm0, %xmm1
; AVX1-NEXT:    vpsrad $3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # BB#0:
; AVX2-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    retq
  %lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
  ret <4 x i32> %lshr
}