; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32:
; AVX2-FAST-ALL: # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32:
; AVX2-FAST-PERLANE: # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i32>
  ret <8 x i32> %0
}

define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_ashr:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i32_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST-ALL: # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr:
; AVX2-FAST-PERLANE: # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
  %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
; SSE-LABEL: trunc8i64_8i32_lshr:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i32_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST-ALL: # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr:
; AVX2-FAST-PERLANE: # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: retq
entry:
  %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
  %1 = trunc <8 x i64> %0 to <8 x i32>
  ret <8 x i32> %1
}

define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i64_8i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i64_8i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i16>
  ret <8 x i16> %0
}

define void @trunc8i64_8i8(<8 x i64> %a) {
; SSE2-LABEL: trunc8i64_8i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i64_8i8:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: packuswb %xmm0, %xmm0
; SSSE3-NEXT: movq %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i64_8i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm4, %xmm1
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm0
; SSE41-NEXT: movq %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i64_8i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i64_8i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovqb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <8 x i64> %a to <8 x i8>
  store <8 x i8> %0, ptr undef, align 4
  ret void
}

define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i16>
  ret <8 x i16> %0
}

define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_ashr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i16_ashr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i16_ashr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i16_ashr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i16_ashr:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16_ashr:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16_ashr:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i16_lshr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i16_lshr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i16_lshr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i16_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i16_lshr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i16_lshr:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i16_lshr:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i16_lshr:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <8 x i32> %0 to <8 x i16>
  ret <8 x i16> %1
}

define void @trunc8i32_8i8(<8 x i32> %a) {
; SSE2-LABEL: trunc8i32_8i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: movq %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc8i32_8i8:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm2, %xmm1
; SSSE3-NEXT: pshufb %xmm2, %xmm0
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movq %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc8i32_8i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm2, %xmm1
; SSE41-NEXT: pshufb %xmm2, %xmm0
; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE41-NEXT: movq %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc8i32_8i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vmovq %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc8i32_8i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc8i32_8i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc8i32_8i8:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc8i32_8i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc8i32_8i8:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <8 x i32> %a to <8 x i8>
  store <8 x i8> %0, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i16(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: pslld $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pslld $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: pslld $16, %xmm0
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: pslld $16, %xmm3
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: pslld $16, %xmm2
; SSSE3-NEXT: psrad $16, %xmm2
; SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSSE3-NEXT: movdqu %xmm2, (%rax)
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: movdqu %xmm2, (%rax)
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i16:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i16>
  store <16 x i16> %0, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_ashr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i16_ashr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: psrad $16, %xmm2
; SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSSE3-NEXT: movdqu %xmm2, (%rax)
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i16_ashr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $16, %xmm3
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: movdqu %xmm2, (%rax)
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i16_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i16_ashr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i16_ashr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i16_lshr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: movdqu %xmm2, (%rax)
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i16_lshr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm1, %xmm0
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: psrad $16, %xmm2
; SSSE3-NEXT: packssdw %xmm3, %xmm2
; SSSE3-NEXT: movdqu %xmm2, (%rax)
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i16_lshr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $16, %xmm3
; SSE41-NEXT: psrld $16, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: psrld $16, %xmm1
; SSE41-NEXT: psrld $16, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: movdqu %xmm2, (%rax)
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i16_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i16_lshr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i16_lshr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
; AVX512-NEXT: vpmovdw %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %1 = trunc <16 x i32> %0 to <16 x i16>
  store <16 x i16> %1, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i8(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i8:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSSE3-NEXT: pand %xmm4, %xmm3
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE41-NEXT: pand %xmm4, %xmm3
; SSE41-NEXT: pand %xmm4, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm4, %xmm1
; SSE41-NEXT: pand %xmm4, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i8:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = trunc <16 x i32> %a to <16 x i8>
  store <16 x i8> %0, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_ashr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrld $24, %xmm1
; SSE2-NEXT: psrld $24, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: psrld $24, %xmm3
; SSE2-NEXT: psrld $24, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i8_ashr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrld $24, %xmm1
; SSSE3-NEXT: psrld $24, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: psrld $24, %xmm3
; SSSE3-NEXT: psrld $24, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i8_ashr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $24, %xmm1
; SSE41-NEXT: psrld $24, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psrld $24, %xmm3
; SSE41-NEXT: psrld $24, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i8_ashr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i8_ashr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, ptr undef, align 4
  ret void
}

define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
; SSE2-LABEL: trunc16i32_16i8_lshr:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: psrld $24, %xmm1
; SSE2-NEXT: psrld $24, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: psrld $24, %xmm3
; SSE2-NEXT: psrld $24, %xmm2
; SSE2-NEXT: packuswb %xmm3, %xmm2
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc16i32_16i8_lshr:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: psrld $24, %xmm1
; SSSE3-NEXT: psrld $24, %xmm0
; SSSE3-NEXT: packuswb %xmm1, %xmm0
; SSSE3-NEXT: psrld $24, %xmm3
; SSSE3-NEXT: psrld $24, %xmm2
; SSSE3-NEXT: packuswb %xmm3, %xmm2
; SSSE3-NEXT: packuswb %xmm2, %xmm0
; SSSE3-NEXT: movdqu %xmm0, (%rax)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc16i32_16i8_lshr:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: psrld $24, %xmm1
; SSE41-NEXT: psrld $24, %xmm0
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: psrld $24, %xmm3
; SSE41-NEXT: psrld $24, %xmm2
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm0
; SSE41-NEXT: movdqu %xmm0, (%rax)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc16i32_16i8_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i32_16i8_lshr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc16i32_16i8_lshr:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
  %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %1 = trunc <16 x i32> %0 to <16 x i8>
  store <16 x i8> %1, ptr undef, align 4
  ret void
}

;PR25684
define void @trunc16i16_16i8(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i16_16i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i16_16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc16i16_16i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <16 x i16> %a to <16 x i8>
  store <16 x i8> %0, ptr undef, align 4
  ret void
}

define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_ashr:
; SSE: # %bb.0: # %entry
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i16_16i8_ashr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i16_16i8_ashr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc16i16_16i8_ashr:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8_ashr:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8_ashr:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, ptr undef, align 4
  ret void
}

define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
; SSE-LABEL: trunc16i16_16i8_lshr:
; SSE: # %bb.0: # %entry
; SSE-NEXT: psrlw $8, %xmm1
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc16i16_16i8_lshr:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc16i16_16i8_lshr:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqu %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc16i16_16i8_lshr:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc16i16_16i8_lshr:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc16i16_16i8_lshr:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %1 = trunc <16 x i16> %0 to <16 x i8>
  store <16 x i8> %1, ptr undef, align 4
  ret void
}

define void @trunc32i16_32i8(<32 x i16> %a) {
; SSE-LABEL: trunc32i16_32i8:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: movdqu %xmm2, (%rax)
; SSE-NEXT: movdqu %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc32i16_32i8:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vmovdqu %xmm1, (%rax)
; AVX1-NEXT: vmovdqu %xmm0, (%rax)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc32i16_32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vmovdqu %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc32i16_32i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512F-NEXT: vpmovdb %zmm1, (%rax)
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc32i16_32i8:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512VL-NEXT: vpmovdb %zmm1, (%rax)
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc32i16_32i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc32i16_32i8:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <32 x i16> %a to <32 x i8>
  store <32 x i8> %0, ptr undef, align 4
  ret void
}

define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
; SSE-LABEL: trunc2x4i64_8i32:
; SSE: # %bb.0: # %entry
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc2x4i64_8i32:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
; AVX2-SLOW: # %bb.0: # %entry
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc2x4i64_8i32:
; AVX2-FAST-ALL: # %bb.0: # %entry
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc2x4i64_8i32:
; AVX2-FAST-PERLANE: # %bb.0: # %entry
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i32:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i32:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i32:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc2x4i64_8i32:
; AVX512BWVL: # %bb.0: # %entry
; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: retq
entry:
  %0 = trunc <4 x i64> %a to <4 x i32>
  %1 = trunc <4 x i64> %b to <4 x i32>
  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %2
}

define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE2-LABEL: trunc2x4i64_8i16:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc2x4i64_8i16:
; SSSE3: # %bb.0: # %entry
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc2x4i64_8i16:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc2x4i64_8i16:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc2x4i64_8i16:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc2x4i64_8i16:
; AVX512VL: # %bb.0: # %entry
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc2x4i64_8i16:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1529; AVX512BW-NEXT: vzeroupper 1530; AVX512BW-NEXT: retq 1531; 1532; AVX512BWVL-LABEL: trunc2x4i64_8i16: 1533; AVX512BWVL: # %bb.0: # %entry 1534; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 1535; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1 1536; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1537; AVX512BWVL-NEXT: vzeroupper 1538; AVX512BWVL-NEXT: retq 1539entry: 1540 %0 = trunc <4 x i64> %a to <4 x i16> 1541 %1 = trunc <4 x i64> %b to <4 x i16> 1542 %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1543 ret <8 x i16> %2 1544} 1545 1546define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) { 1547; SSE-LABEL: trunc2x2i64_4i32: 1548; SSE: # %bb.0: # %entry 1549; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1550; SSE-NEXT: retq 1551; 1552; AVX-LABEL: trunc2x2i64_4i32: 1553; AVX: # %bb.0: # %entry 1554; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1555; AVX-NEXT: retq 1556; 1557; AVX512F-LABEL: trunc2x2i64_4i32: 1558; AVX512F: # %bb.0: # %entry 1559; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1560; AVX512F-NEXT: retq 1561; 1562; AVX512VL-LABEL: trunc2x2i64_4i32: 1563; AVX512VL: # %bb.0: # %entry 1564; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1565; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1566; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 1567; AVX512VL-NEXT: vzeroupper 1568; AVX512VL-NEXT: retq 1569; 1570; AVX512BW-LABEL: trunc2x2i64_4i32: 1571; AVX512BW: # %bb.0: # %entry 1572; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1573; AVX512BW-NEXT: retq 1574; 1575; AVX512BWVL-LABEL: trunc2x2i64_4i32: 1576; AVX512BWVL: # %bb.0: # %entry 1577; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1578; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1579; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 1580; AVX512BWVL-NEXT: vzeroupper 1581; AVX512BWVL-NEXT: retq 1582entry: 1583 %0 = trunc <2 x i64> %a to <2 x i32> 1584 %1 = trunc <2 x i64> %b to <2 x i32> 1585 %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 1586 ret <4 x i32> %2 1587} 1588 1589define i64 @trunc2i64_i64(<2 x i64> %inval) { 1590; SSE-LABEL: trunc2i64_i64: 1591; SSE: # %bb.0: # %entry 1592; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1593; SSE-NEXT: movq %xmm0, %rax 1594; SSE-NEXT: retq 1595; 1596; AVX-LABEL: trunc2i64_i64: 1597; AVX: # %bb.0: # %entry 1598; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1599; AVX-NEXT: vmovq %xmm0, %rax 1600; AVX-NEXT: retq 1601; 1602; AVX512-LABEL: trunc2i64_i64: 1603; AVX512: # %bb.0: # %entry 1604; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1605; AVX512-NEXT: vmovq %xmm0, %rax 1606; AVX512-NEXT: retq 1607entry: 1608 %0 = trunc <2 x i64> %inval to <2 x i32> 1609 %1 = bitcast <2 x i32> %0 to i64 1610 ret i64 %1 1611} 1612 1613define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) { 1614; SSE2-LABEL: trunc2x4i32_8i16: 1615; SSE2: # %bb.0: # %entry 1616; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1617; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] 1618; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1619; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1620; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 1621; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1622; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1623; SSE2-NEXT: retq 1624; 1625; SSSE3-LABEL: trunc2x4i32_8i16: 1626; SSSE3: # %bb.0: # %entry 1627; 
SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 1628; SSSE3-NEXT: pshufb %xmm2, %xmm1 1629; SSSE3-NEXT: pshufb %xmm2, %xmm0 1630; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1631; SSSE3-NEXT: retq 1632; 1633; SSE41-LABEL: trunc2x4i32_8i16: 1634; SSE41: # %bb.0: # %entry 1635; SSE41-NEXT: pxor %xmm2, %xmm2 1636; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] 1637; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 1638; SSE41-NEXT: packusdw %xmm1, %xmm0 1639; SSE41-NEXT: retq 1640; 1641; AVX-LABEL: trunc2x4i32_8i16: 1642; AVX: # %bb.0: # %entry 1643; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 1644; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] 1645; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] 1646; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1647; AVX-NEXT: retq 1648; 1649; AVX512F-LABEL: trunc2x4i32_8i16: 1650; AVX512F: # %bb.0: # %entry 1651; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1652; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1653; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 1654; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1655; AVX512F-NEXT: vzeroupper 1656; AVX512F-NEXT: retq 1657; 1658; AVX512VL-LABEL: trunc2x4i32_8i16: 1659; AVX512VL: # %bb.0: # %entry 1660; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1661; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1662; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 1663; AVX512VL-NEXT: vzeroupper 1664; AVX512VL-NEXT: retq 1665; 1666; AVX512BW-LABEL: trunc2x4i32_8i16: 1667; AVX512BW: # %bb.0: # %entry 1668; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1669; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1670; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 1671; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1672; AVX512BW-NEXT: vzeroupper 1673; AVX512BW-NEXT: retq 1674; 1675; AVX512BWVL-LABEL: trunc2x4i32_8i16: 1676; AVX512BWVL: # %bb.0: # %entry 1677; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1678; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1679; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 1680; AVX512BWVL-NEXT: vzeroupper 1681; AVX512BWVL-NEXT: retq 1682entry: 1683 %0 = trunc <4 x i32> %a to <4 x i16> 1684 %1 = trunc <4 x i32> %b to <4 x i16> 1685 %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1686 ret <8 x i16> %2 1687} 1688 1689; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 1690define i64 @trunc4i32_i64(<4 x i32> %inval) { 1691; SSE2-LABEL: trunc4i32_i64: 1692; SSE2: # %bb.0: # %entry 1693; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1694; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 1695; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1696; SSE2-NEXT: movq %xmm0, %rax 1697; SSE2-NEXT: retq 1698; 1699; SSSE3-LABEL: trunc4i32_i64: 1700; SSSE3: # %bb.0: # %entry 1701; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1702; SSSE3-NEXT: movq %xmm0, %rax 1703; SSSE3-NEXT: retq 1704; 1705; SSE41-LABEL: trunc4i32_i64: 1706; SSE41: # %bb.0: # %entry 1707; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1708; SSE41-NEXT: movq %xmm0, %rax 1709; SSE41-NEXT: retq 1710; 1711; AVX-LABEL: trunc4i32_i64: 1712; AVX: # %bb.0: # %entry 1713; AVX-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1714; AVX-NEXT: vmovq %xmm0, %rax 1715; AVX-NEXT: retq 1716; 1717; AVX512F-LABEL: trunc4i32_i64: 1718; AVX512F: # %bb.0: # %entry 1719; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1720; AVX512F-NEXT: vmovq %xmm0, %rax 1721; AVX512F-NEXT: retq 1722; 1723; AVX512VL-LABEL: trunc4i32_i64: 1724; AVX512VL: # %bb.0: # %entry 1725; AVX512VL-NEXT: vpmovdw %xmm0, %xmm0 1726; AVX512VL-NEXT: vmovq %xmm0, %rax 1727; AVX512VL-NEXT: retq 1728; 1729; AVX512BW-LABEL: trunc4i32_i64: 1730; AVX512BW: # %bb.0: # %entry 1731; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 1732; AVX512BW-NEXT: vmovq %xmm0, %rax 1733; AVX512BW-NEXT: retq 1734; 1735; AVX512BWVL-LABEL: trunc4i32_i64: 1736; AVX512BWVL: # %bb.0: # %entry 1737; AVX512BWVL-NEXT: vpmovdw %xmm0, %xmm0 1738; AVX512BWVL-NEXT: vmovq %xmm0, %rax 1739; AVX512BWVL-NEXT: retq 1740entry: 1741 %0 = trunc <4 x i32> %inval to <4 x i16> 1742 %1 = bitcast <4 x i16> %0 to i64 1743 ret i64 %1 1744} 1745 1746define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { 1747; SSE2-LABEL: trunc2x8i16_16i8: 1748; SSE2: # %bb.0: # %entry 1749; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1750; SSE2-NEXT: pand %xmm2, %xmm0 1751; SSE2-NEXT: pand %xmm2, %xmm1 1752; SSE2-NEXT: packuswb %xmm1, %xmm0 1753; SSE2-NEXT: retq 1754; 1755; SSSE3-LABEL: trunc2x8i16_16i8: 1756; SSSE3: # %bb.0: # %entry 1757; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1758; SSSE3-NEXT: pand %xmm2, %xmm1 1759; SSSE3-NEXT: pand %xmm2, %xmm0 1760; SSSE3-NEXT: packuswb %xmm1, %xmm0 1761; SSSE3-NEXT: retq 1762; 1763; SSE41-LABEL: trunc2x8i16_16i8: 1764; SSE41: # %bb.0: # %entry 1765; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1766; SSE41-NEXT: pand %xmm2, %xmm1 1767; SSE41-NEXT: pand %xmm2, %xmm0 1768; SSE41-NEXT: packuswb %xmm1, %xmm0 1769; SSE41-NEXT: retq 1770; 1771; AVX-LABEL: trunc2x8i16_16i8: 1772; AVX: # %bb.0: # %entry 1773; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1774; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 1775; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 1776; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1777; AVX-NEXT: retq 1778; 1779; AVX512F-LABEL: trunc2x8i16_16i8: 1780; AVX512F: # %bb.0: # %entry 1781; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1782; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 1783; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 1784; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1785; AVX512F-NEXT: retq 1786; 1787; AVX512VL-LABEL: trunc2x8i16_16i8: 1788; AVX512VL: # %bb.0: # %entry 1789; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 1790; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm1 1791; AVX512VL-NEXT: vpand %xmm2, %xmm0, %xmm0 1792; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1793; AVX512VL-NEXT: retq 1794; 1795; AVX512BW-LABEL: trunc2x8i16_16i8: 1796; AVX512BW: # %bb.0: # %entry 1797; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1798; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1799; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1800; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1801; AVX512BW-NEXT: vzeroupper 1802; AVX512BW-NEXT: retq 1803; 1804; AVX512BWVL-LABEL: trunc2x8i16_16i8: 1805; AVX512BWVL: # %bb.0: # %entry 1806; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 1807; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 1808; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 1809; AVX512BWVL-NEXT: 
vzeroupper 1810; AVX512BWVL-NEXT: retq 1811entry: 1812 %0 = trunc <8 x i16> %a to <8 x i8> 1813 %1 = trunc <8 x i16> %b to <8 x i8> 1814 %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1815 ret <16 x i8> %2 1816} 1817 1818; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 1819define i64 @trunc8i16_i64(<8 x i16> %inval) { 1820; SSE2-LABEL: trunc8i16_i64: 1821; SSE2: # %bb.0: # %entry 1822; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1823; SSE2-NEXT: packuswb %xmm0, %xmm0 1824; SSE2-NEXT: movq %xmm0, %rax 1825; SSE2-NEXT: retq 1826; 1827; SSSE3-LABEL: trunc8i16_i64: 1828; SSSE3: # %bb.0: # %entry 1829; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1830; SSSE3-NEXT: movq %xmm0, %rax 1831; SSSE3-NEXT: retq 1832; 1833; SSE41-LABEL: trunc8i16_i64: 1834; SSE41: # %bb.0: # %entry 1835; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1836; SSE41-NEXT: movq %xmm0, %rax 1837; SSE41-NEXT: retq 1838; 1839; AVX-LABEL: trunc8i16_i64: 1840; AVX: # %bb.0: # %entry 1841; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1842; AVX-NEXT: vmovq %xmm0, %rax 1843; AVX-NEXT: retq 1844; 1845; AVX512F-LABEL: trunc8i16_i64: 1846; AVX512F: # %bb.0: # %entry 1847; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1848; AVX512F-NEXT: vmovq %xmm0, %rax 1849; AVX512F-NEXT: retq 1850; 1851; AVX512VL-LABEL: trunc8i16_i64: 1852; AVX512VL: # %bb.0: # %entry 1853; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1854; AVX512VL-NEXT: vmovq %xmm0, %rax 1855; AVX512VL-NEXT: retq 1856; 1857; AVX512BW-LABEL: trunc8i16_i64: 1858; AVX512BW: # %bb.0: # %entry 1859; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] 1860; AVX512BW-NEXT: vmovq %xmm0, %rax 1861; AVX512BW-NEXT: retq 1862; 1863; AVX512BWVL-LABEL: trunc8i16_i64: 1864; AVX512BWVL: # %bb.0: # %entry 1865; AVX512BWVL-NEXT: vpmovwb %xmm0, %xmm0 1866; AVX512BWVL-NEXT: vmovq %xmm0, %rax 1867; AVX512BWVL-NEXT: retq 1868entry: 1869 %0 = trunc <8 x i16> %inval to <8 x i8> 1870 %1 = bitcast <8 x i8> %0 to i64 1871 ret i64 %1 1872} 1873 1874define <16 x i8> @trunc16i64_16i8_const() { 1875; SSE-LABEL: trunc16i64_16i8_const: 1876; SSE: # %bb.0: # %entry 1877; SSE-NEXT: xorps %xmm0, %xmm0 1878; SSE-NEXT: retq 1879; 1880; AVX-LABEL: trunc16i64_16i8_const: 1881; AVX: # %bb.0: # %entry 1882; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 1883; AVX-NEXT: retq 1884; 1885; AVX512-LABEL: trunc16i64_16i8_const: 1886; AVX512: # %bb.0: # %entry 1887; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 1888; AVX512-NEXT: retq 1889 1890entry: 1891 %0 = trunc <16 x i64> zeroinitializer to <16 x i8> 1892 %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26> 1893 ret <16 x i8> %1 1894} 1895 1896define <8 x i16> @PR32160(<8 x i32> %x) { 1897; SSE-LABEL: PR32160: 1898; SSE: # %bb.0: 1899; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] 1900; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] 1901; SSE-NEXT: retq 1902; 1903; AVX-LABEL: PR32160: 1904; AVX: # %bb.0: 1905; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] 1906; AVX-NEXT: vzeroupper 1907; AVX-NEXT: retq 1908; 1909; AVX512F-LABEL: PR32160: 1910; AVX512F: # %bb.0: 1911; AVX512F-NEXT: # kill: def $ymm0 
killed $ymm0 def $zmm0 1912; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 1913; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] 1914; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 1915; AVX512F-NEXT: vzeroupper 1916; AVX512F-NEXT: retq 1917; 1918; AVX512VL-LABEL: PR32160: 1919; AVX512VL: # %bb.0: 1920; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 1921; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1922; AVX512VL-NEXT: vzeroupper 1923; AVX512VL-NEXT: retq 1924; 1925; AVX512BW-LABEL: PR32160: 1926; AVX512BW: # %bb.0: 1927; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1928; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 1929; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1930; AVX512BW-NEXT: vzeroupper 1931; AVX512BW-NEXT: retq 1932; 1933; AVX512BWVL-LABEL: PR32160: 1934; AVX512BWVL: # %bb.0: 1935; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 1936; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] 1937; AVX512BWVL-NEXT: vzeroupper 1938; AVX512BWVL-NEXT: retq 1939 %shuf = trunc <8 x i32> %x to <8 x i16> 1940 %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 1941 ret <8 x i16> %trunc 1942} 1943 1944define void @PR34773(ptr %a0, ptr %a1) { 1945; SSE-LABEL: PR34773: 1946; SSE: # %bb.0: 1947; SSE-NEXT: movdqu (%rdi), %xmm0 1948; SSE-NEXT: movdqu 16(%rdi), %xmm1 1949; SSE-NEXT: movdqu 32(%rdi), %xmm2 1950; SSE-NEXT: movdqu 48(%rdi), %xmm3 1951; SSE-NEXT: psrlw $8, %xmm1 1952; SSE-NEXT: psrlw $8, %xmm0 1953; SSE-NEXT: packuswb %xmm1, %xmm0 1954; SSE-NEXT: psrlw $8, %xmm3 1955; SSE-NEXT: psrlw $8, %xmm2 1956; SSE-NEXT: packuswb %xmm3, %xmm2 1957; SSE-NEXT: movdqu %xmm0, (%rsi) 1958; SSE-NEXT: movdqu %xmm2, 16(%rsi) 1959; SSE-NEXT: retq 1960; 1961; AVX1-LABEL: PR34773: 1962; AVX1: # %bb.0: 1963; AVX1-NEXT: vmovdqu (%rdi), %xmm0 1964; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 1965; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 1966; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 1967; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 1968; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1969; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1970; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1 1971; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 1972; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 1973; AVX1-NEXT: vmovdqu %xmm0, (%rsi) 1974; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) 1975; AVX1-NEXT: retq 1976; 1977; AVX2-LABEL: PR34773: 1978; AVX2: # %bb.0: 1979; AVX2-NEXT: vmovdqu (%rdi), %ymm0 1980; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 1981; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1982; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 1983; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 1984; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1985; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1986; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 1987; AVX2-NEXT: vmovdqu %xmm0, (%rsi) 1988; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi) 1989; AVX2-NEXT: vzeroupper 1990; AVX2-NEXT: retq 1991; 1992; AVX512F-LABEL: PR34773: 1993; AVX512F: # %bb.0: 1994; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 1995; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 1996; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 1997; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 1998; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1999; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) 2000; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 2001; AVX512F-NEXT: vpmovdb %zmm0, 16(%rsi) 2002; AVX512F-NEXT: vzeroupper 2003; AVX512F-NEXT: retq 2004; 2005; AVX512VL-LABEL: PR34773: 2006; AVX512VL: # %bb.0: 2007; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 2008; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1 2009; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 2010; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 2011; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2012; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) 2013; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 2014; AVX512VL-NEXT: vpmovdb %zmm0, 16(%rsi) 2015; AVX512VL-NEXT: vzeroupper 2016; AVX512VL-NEXT: retq 2017; 2018; AVX512BW-LABEL: PR34773: 2019; AVX512BW: # %bb.0: 2020; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 2021; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1 2022; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 2023; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 2024; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2025; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 2026; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) 2027; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi) 2028; AVX512BW-NEXT: vzeroupper 2029; AVX512BW-NEXT: retq 2030; 2031; AVX512BWVL-LABEL: PR34773: 2032; AVX512BWVL: # %bb.0: 2033; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0 2034; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1 2035; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) 2036; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi) 2037; AVX512BWVL-NEXT: vzeroupper 2038; AVX512BWVL-NEXT: retq 2039 %1 = getelementptr i16, ptr %a0, i64 16 2040 %2 = getelementptr i8, ptr %a1, i64 16 2041 %3 = load <16 x i16>, ptr %a0, align 2 2042 %4 = load <16 x i16>, ptr %1, align 2 2043 %5 = lshr <16 x i16> %3, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2044 %6 = lshr <16 x i16> %4, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 2045 %7 = trunc <16 x i16> %5 to <16 x i8> 2046 %8 = trunc <16 x i16> %6 to <16 x i8> 2047 store <16 x i8> %7, ptr %a1, align 1 2048 store <16 x i8> %8, ptr %2, align 1 2049 ret void 2050} 2051 2052; Store merging must not infinitely fight store splitting. 
2053 2054define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, ptr %p) align 2 { 2055; SSE2-LABEL: store_merge_split: 2056; SSE2: # %bb.0: 2057; SSE2-NEXT: pslld $16, %xmm1 2058; SSE2-NEXT: psrad $16, %xmm1 2059; SSE2-NEXT: pslld $16, %xmm0 2060; SSE2-NEXT: psrad $16, %xmm0 2061; SSE2-NEXT: packssdw %xmm1, %xmm0 2062; SSE2-NEXT: pslld $16, %xmm3 2063; SSE2-NEXT: psrad $16, %xmm3 2064; SSE2-NEXT: pslld $16, %xmm2 2065; SSE2-NEXT: psrad $16, %xmm2 2066; SSE2-NEXT: packssdw %xmm3, %xmm2 2067; SSE2-NEXT: shlq $4, %rdi 2068; SSE2-NEXT: movdqu %xmm0, (%rsi,%rdi) 2069; SSE2-NEXT: movdqu %xmm2, 16(%rsi,%rdi) 2070; SSE2-NEXT: retq 2071; 2072; SSSE3-LABEL: store_merge_split: 2073; SSSE3: # %bb.0: 2074; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2075; SSSE3-NEXT: pshufb %xmm4, %xmm1 2076; SSSE3-NEXT: pshufb %xmm4, %xmm0 2077; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2078; SSSE3-NEXT: pshufb %xmm4, %xmm3 2079; SSSE3-NEXT: pshufb %xmm4, %xmm2 2080; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] 2081; SSSE3-NEXT: shlq $4, %rdi 2082; SSSE3-NEXT: movdqu %xmm0, (%rsi,%rdi) 2083; SSSE3-NEXT: movdqu %xmm2, 16(%rsi,%rdi) 2084; SSSE3-NEXT: retq 2085; 2086; SSE41-LABEL: store_merge_split: 2087; SSE41: # %bb.0: 2088; SSE41-NEXT: pxor %xmm4, %xmm4 2089; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] 2090; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] 2091; SSE41-NEXT: packusdw %xmm1, %xmm0 2092; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] 2093; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] 2094; SSE41-NEXT: packusdw %xmm3, %xmm2 2095; SSE41-NEXT: shlq $4, %rdi 2096; SSE41-NEXT: movdqu %xmm0, (%rsi,%rdi) 2097; SSE41-NEXT: movdqu %xmm2, 16(%rsi,%rdi) 2098; SSE41-NEXT: retq 2099; 2100; AVX1-LABEL: store_merge_split: 2101; AVX1: # %bb.0: 2102; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] 2103; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2104; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2105; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 2106; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2107; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2108; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2109; AVX1-NEXT: shlq $4, %rdi 2110; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi) 2111; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) 2112; AVX1-NEXT: vzeroupper 2113; AVX1-NEXT: retq 2114; 2115; AVX2-LABEL: store_merge_split: 2116; AVX2: # %bb.0: 2117; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] 2118; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 2119; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2120; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 2121; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2122; AVX2-NEXT: shlq $4, %rdi 2123; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi) 2124; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) 2125; AVX2-NEXT: vzeroupper 2126; AVX2-NEXT: retq 2127; 2128; AVX512F-LABEL: store_merge_split: 2129; AVX512F: # %bb.0: 2130; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2131; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2132; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 2133; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 2134; AVX512F-NEXT: shlq $4, %rdi 2135; AVX512F-NEXT: vmovdqu %xmm0, (%rsi,%rdi) 2136; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) 2137; 
AVX512F-NEXT: vzeroupper 2138; AVX512F-NEXT: retq 2139; 2140; AVX512VL-LABEL: store_merge_split: 2141; AVX512VL: # %bb.0: 2142; AVX512VL-NEXT: shlq $4, %rdi 2143; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) 2144; AVX512VL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) 2145; AVX512VL-NEXT: vzeroupper 2146; AVX512VL-NEXT: retq 2147; 2148; AVX512BW-LABEL: store_merge_split: 2149; AVX512BW: # %bb.0: 2150; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2151; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2152; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 2153; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 2154; AVX512BW-NEXT: shlq $4, %rdi 2155; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi,%rdi) 2156; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) 2157; AVX512BW-NEXT: vzeroupper 2158; AVX512BW-NEXT: retq 2159; 2160; AVX512BWVL-LABEL: store_merge_split: 2161; AVX512BWVL: # %bb.0: 2162; AVX512BWVL-NEXT: shlq $4, %rdi 2163; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) 2164; AVX512BWVL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) 2165; AVX512BWVL-NEXT: vzeroupper 2166; AVX512BWVL-NEXT: retq 2167 %t1 = trunc <8 x i32> %w1 to <8 x i16> 2168 %t2 = trunc <8 x i32> %w2 to <8 x i16> 2169 %g1 = getelementptr inbounds <8 x i16>, ptr %p, i64 %idx 2170 %g2 = getelementptr inbounds <8 x i16>, ptr %g1, i64 1 2171 store <8 x i16> %t1, ptr %g1, align 2 2172 store <8 x i16> %t2, ptr %g2, align 2 2173 ret void 2174} 2175