; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32:
; AVX2-FAST-ALL: # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32:
; AVX2-FAST-PERLANE: # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i32>
  ret <8 x i32> %0
}

define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_ashr:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i32_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST-ALL: # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST-PERLANE: # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
  %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_lshr:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i32_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST-ALL: # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST-PERLANE: # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
  %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i64_8i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i64_8i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %0
}

define void @trunc8i64_8i8(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i64_8i8:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: movq %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i64_8i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm4, %xmm1
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm0
; SSE41-NEXT: movq %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovqb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %0
}

define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_ashr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i16_ashr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i16_ashr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i16_ashr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i16_ashr:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16_ashr:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16_ashr:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_lshr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i16_lshr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i16_lshr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i16_lshr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i16_lshr:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16_lshr:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16_lshr:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i8:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movq %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: movq %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i8>
  store <8 x i8> %0, <8 x i8>* undef, align 4
  ret void
}

define void @trunc16i32_16i16(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pslld $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pslld $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: pslld $16, %xmm0
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: pslld $16, %xmm3
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: pslld $16, %xmm2
; SSSE3-NEXT: psrad $16, %xmm2
; SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSSE3-NEXT: movdqu %xmm2, (%rax)
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: movdqu %xmm2, (%rax)
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i16>
  store <16 x i16> %0, <16 x i16>* undef, align 4
  ret void
}

define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_ashr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i16_ashr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: psrad $16, %xmm2
; SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSSE3-NEXT: movdqu %xmm2, (%rax)
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i16_ashr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $16, %xmm3
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: movdqu %xmm2, (%rax)
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i16_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i16_ashr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i16_ashr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_lshr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i16_lshr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: psrad $16, %xmm2
; SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSSE3-NEXT: movdqu %xmm2, (%rax)
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i16_lshr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $16, %xmm3
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: movdqu %xmm2, (%rax)
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i16_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i16_lshr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i16_lshr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, <16 x i16>* undef, align 4
  ret void
}

define void @trunc16i32_16i8(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i8:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm4, %xmm1
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_ashr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrld $24, %xmm1
; SSE2-NEXT: psrld $24, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: psrld $24, %xmm3
; SSE2-NEXT: psrld $24, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i8_ashr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrld $24, %xmm1
; SSSE3-NEXT: psrld $24, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: psrld $24, %xmm3
; SSSE3-NEXT: psrld $24, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i8_ashr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $24, %xmm1
; SSE41-NEXT: psrld $24, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psrld $24, %xmm3
; SSE41-NEXT: psrld $24, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i8_ashr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i8_ashr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_lshr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrld $24, %xmm1
; SSE2-NEXT: psrld $24, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: psrld $24, %xmm3
; SSE2-NEXT: psrld $24, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i8_lshr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrld $24, %xmm1
; SSSE3-NEXT: psrld $24, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: psrld $24, %xmm3
; SSSE3-NEXT: psrld $24, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i8_lshr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $24, %xmm1
; SSE41-NEXT: psrld $24, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psrld $24, %xmm3
; SSE41-NEXT: psrld $24, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i8_lshr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i8_lshr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

;PR25684
define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i16_16i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i16_16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc16i16_16i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <16 x i16> %a to <16 x i8>
  store <16 x i8> %0, <16 x i8>* undef, align 4
  ret void
}

define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_ashr:
; SSE: # %bb.0: # %entry
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i16_16i8_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i16_16i8_ashr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc16i16_16i8_ashr:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8_ashr:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8_ashr:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_lshr:
; SSE: # %bb.0: # %entry
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i16_16i8_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i16_16i8_lshr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc16i16_16i8_lshr:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8_lshr:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8_lshr:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE-LABEL: trunc32i16_32i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: movdqu %xmm2, (%rax)
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc32i16_32i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc32i16_32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc32i16_32i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, (%rax)
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpmovdb %zmm1, (%rax)
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <32 x i16> %a to <32 x i8>
  store <32 x i8> %0, <32 x i8>* undef, align 4
  ret void
}

define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; SSE-LABEL: trunc2x4i64_8i32:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc2x4i64_8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32:
; AVX2-FAST-ALL: # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32:
; AVX2-FAST-PERLANE: # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i32:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i32:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i32:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i32>
  %1 = trunc <4 x i64> %b to <4 x i32>
  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %2
}

define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: trunc2x4i64_8i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x4i64_8i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x4i64_8i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc2x4i64_8i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
; SSE-LABEL: trunc2x2i64_4i32:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX-LABEL: trunc2x2i64_4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc2x2i64_4i32:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x2i64_4i32:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x2i64_4i32:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x2i64_4i32:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <2 x i64> %a to <2 x i32>
  %1 = trunc <2 x i64> %b to <2 x i32>
  %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %2
}

define i64 @trunc2i64_i64(<2 x i64> %inval) {
; SSE-LABEL: trunc2i64_i64:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: retq
;
; AVX-LABEL: trunc2i64_i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc2i64_i64:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX512-NEXT: vmovq %xmm0, %rax
; AVX512-NEXT: retq
entry:
  %0 = trunc <2 x i64> %inval to <2 x i32>
  %1 = bitcast <2 x i32> %0 to i64
  ret i64 %1
}

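; Truncate two <4 x i32> vectors to <4 x i16> and concatenate the results.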
define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
; SSE2-LABEL: trunc2x4i32_8i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x4i32_8i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x4i32_8i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc2x4i32_8i16:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i32_8i16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i32_8i16:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i32_8i16:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i32_8i16:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <4 x i32> %a to <4 x i16>
  %1 = trunc <4 x i32> %b to <4 x i16>
  %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
define i64 @trunc4i32_i64(<4 x i32> %inval) {
; SSE2-LABEL: trunc4i32_i64:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc4i32_i64:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc4i32_i64:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc4i32_i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc4i32_i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc4i32_i64:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc4i32_i64:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc4i32_i64:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovdw %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <4 x i32> %inval to <4 x i16>
  %1 = bitcast <4 x i16> %0 to i64
  ret i64 %1
}

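; Truncate two <8 x i16> vectors to <8 x i8> and concatenate the results.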
define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
; SSE2-LABEL: trunc2x8i16_16i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x8i16_16i8:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x8i16_16i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm2, %xmm1
; SSE41-NEXT: pand %xmm2, %xmm0
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc2x8i16_16i8:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc2x8i16_16i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x8i16_16i8:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x8i16_16i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x8i16_16i8:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i16> %a to <8 x i8>
  %1 = trunc <8 x i16> %b to <8 x i8>
  %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
define i64 @trunc8i16_i64(<8 x i16> %inval) {
; SSE2-LABEL: trunc8i16_i64:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i16_i64:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movq %xmm0, %rax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i16_i64:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movq %xmm0, %rax
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc8i16_i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovq %xmm0, %rax
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc8i16_i64:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i16_i64:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i16_i64:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i16_i64:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i16> %inval to <8 x i8>
  %1 = bitcast <8 x i8> %0 to i64
  ret i64 %1
}

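; Truncation of a zero constant vector folds to an all-zeros result.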
define <16 x i8> @trunc16i64_16i8_const() {
; SSE-LABEL: trunc16i64_16i8_const:
; SSE: # %bb.0: # %entry
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: trunc16i64_16i8_const:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc16i64_16i8_const:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq

entry:
  %0 = trunc <16 x i64> zeroinitializer to <16 x i8>
  %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26>
  ret <16 x i8> %1
}

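; PR32160: splat of element 2 of a truncated <8 x i32> vector.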
define <8 x i16> @PR32160(<8 x i32> %x) {
; SSE-LABEL: PR32160:
; SSE: # %bb.0:
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
; SSE-NEXT: retq
;
; AVX-LABEL: PR32160:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX512F-LABEL: PR32160:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7]
; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR32160:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR32160:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR32160:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %shuf = trunc <8 x i32> %x to <8 x i16>
  %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i16> %trunc
}

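; PR34773: truncating stores of the high byte of each i16 element of two <16 x i16> loads.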
define void @PR34773(i16* %a0, i8* %a1) {
; SSE-LABEL: PR34773:
; SSE: # %bb.0:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqu 32(%rdi), %xmm2
; SSE-NEXT: movdqu 48(%rdi), %xmm3
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: psrlw $8, %xmm3
; SSE-NEXT: psrlw $8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: movdqu %xmm0, (%rsi)
; SSE-NEXT: movdqu %xmm2, 16(%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: PR34773:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR34773:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vmovdqu %xmm0, (%rsi)
; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: PR34773:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, 16(%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: PR34773:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, 16(%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: PR34773:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34773:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = getelementptr i16, i16* %a0, i64 16
  %2 = getelementptr i8, i8* %a1, i64 16
  %3 = bitcast i16* %a0 to <16 x i16>*
  %4 = bitcast i16* %1 to <16 x i16>*
  %5 = bitcast i8* %a1 to <16 x i8>*
  %6 = bitcast i8* %2 to <16 x i8>*
  %7 = load <16 x i16>, <16 x i16>* %3, align 2
  %8 = load <16 x i16>, <16 x i16>* %4, align 2
  %9 = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %11 = trunc <16 x i16> %9 to <16 x i8>
  %12 = trunc <16 x i16> %10 to <16 x i8>
  store <16 x i8> %11, <16 x i8>* %5, align 1
  store <16 x i8> %12, <16 x i8>* %6, align 1
  ret void
}

; Store merging must not infinitely fight store splitting.

define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, <8 x i16>* %p) align 2 {
; SSE2-LABEL: store_merge_split:
; SSE2: # %bb.0:
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pslld $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: shlq $4, %rdi
; SSE2-NEXT: movdqu %xmm0, (%rsi,%rdi)
; SSE2-NEXT: movdqu %xmm2, 16(%rsi,%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: store_merge_split:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm4, %xmm1
; SSSE3-NEXT: pshufb %xmm4, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: pshufb %xmm4, %xmm3
; SSSE3-NEXT: pshufb %xmm4, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSSE3-NEXT: shlq $4, %rdi
; SSSE3-NEXT: movdqu %xmm0, (%rsi,%rdi)
; SSSE3-NEXT: movdqu %xmm2, 16(%rsi,%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: store_merge_split:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: shlq $4, %rdi
; SSE41-NEXT: movdqu %xmm0, (%rsi,%rdi)
; SSE41-NEXT: movdqu %xmm2, 16(%rsi,%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: store_merge_split:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: shlq $4, %rdi
; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: store_merge_split:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: shlq $4, %rdi
; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: store_merge_split:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: shlq $4, %rdi
; AVX512F-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: store_merge_split:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: shlq $4, %rdi
; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi,%rdi)
; AVX512VL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: store_merge_split:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
; AVX512BW-NEXT: shlq $4, %rdi
; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: store_merge_split:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: shlq $4, %rdi
; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi,%rdi)
; AVX512BWVL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %t1 = trunc <8 x i32> %w1 to <8 x i16>
  %t2 = trunc <8 x i32> %w2 to <8 x i16>
  %g1 = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 %idx
  %g2 = getelementptr inbounds <8 x i16>, <8 x i16>* %g1, i64 1
  store <8 x i16> %t1, <8 x i16>* %g1, align 2
  store <8 x i16> %t2, <8 x i16>* %g2, align 2
  ret void
}