1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW 6 7; trunc(concat(x,y)) -> pack 8 9define <16 x i16> @trunc_concat_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind { 10; AVX1-LABEL: trunc_concat_packssdw_256: 11; AVX1: # %bb.0: 12; AVX1-NEXT: vpsrad $17, %xmm0, %xmm2 13; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 14; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0 15; AVX1-NEXT: vpsrad $23, %xmm1, %xmm3 16; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 17; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 18; AVX1-NEXT: vpsrad $23, %xmm1, %xmm1 19; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 20; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 21; AVX1-NEXT: retq 22; 23; AVX2-LABEL: trunc_concat_packssdw_256: 24; AVX2: # %bb.0: 25; AVX2-NEXT: vpsrad $17, %ymm0, %ymm0 26; AVX2-NEXT: vpsrad $23, %ymm1, %ymm1 27; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 28; AVX2-NEXT: retq 29; 30; AVX512-LABEL: trunc_concat_packssdw_256: 31; AVX512: # %bb.0: 32; AVX512-NEXT: vpsrad $17, %ymm0, %ymm0 33; AVX512-NEXT: vpsrad $23, %ymm1, %ymm1 34; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 35; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 36; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 37; AVX512-NEXT: vpmovdw %zmm0, %ymm0 38; AVX512-NEXT: retq 39 %1 = ashr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17> 40 %2 = ashr <8 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23> 41 %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> 42 %4 = trunc <16 x i32> %3 to <16 x i16> 43 ret <16 x i16> %4 44} 45 46define <16 x i16> @trunc_concat_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind { 47; AVX1-LABEL: trunc_concat_packusdw_256: 48; AVX1: # %bb.0: 49; AVX1-NEXT: vpsrld $17, %xmm0, %xmm2 50; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 51; AVX1-NEXT: vpsrld $17, %xmm0, %xmm0 52; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 53; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 54; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 55; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 56; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 57; AVX1-NEXT: retq 58; 59; AVX2-LABEL: trunc_concat_packusdw_256: 60; AVX2: # %bb.0: 61; AVX2-NEXT: vpsrld $17, %ymm0, %ymm0 62; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15] 63; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 64; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 65; AVX2-NEXT: retq 66; 67; AVX512-LABEL: trunc_concat_packusdw_256: 68; AVX512: # %bb.0: 69; AVX512-NEXT: vpsrld $17, %ymm0, %ymm0 70; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1 71; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 72; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 73; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 74; AVX512-NEXT: vpmovdw %zmm0, %ymm0 75; AVX512-NEXT: retq 76 %1 = lshr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17> 77 %2 = and <8 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> 78 %3 = shufflevector <8 x i32> %1, <8 x i32> %2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> 79 %4 = trunc <16 x i32> %3 to <16 x i16> 80 ret <16 x i16> %4 81} 82 83define <32 x i8> @trunc_concat_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind { 84; AVX1-LABEL: trunc_concat_packsswb_256: 85; AVX1: # %bb.0: 86; AVX1-NEXT: vpsraw $15, %xmm0, %xmm2 87; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 88; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 89; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 90; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 91; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 92; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1 93; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 94; AVX1-NEXT: retq 95; 96; AVX2-LABEL: trunc_concat_packsswb_256: 97; AVX2: # %bb.0: 98; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0 99; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 100; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 101; AVX2-NEXT: retq 102; 103; AVX512F-LABEL: trunc_concat_packsswb_256: 104; AVX512F: # %bb.0: 105; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 106; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 107; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 108; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 109; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 110; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 111; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 112; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 113; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 114; AVX512F-NEXT: retq 115; 116; AVX512BW-LABEL: trunc_concat_packsswb_256: 117; AVX512BW: # %bb.0: 118; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm0 119; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 120; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 121; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 122; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 123; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 124; AVX512BW-NEXT: retq 125 %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> 126 %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 127 %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 128 %4 = trunc <32 x i16> %3 to <32 x i8> 129 ret <32 x i8> %4 130} 131 132define <32 x i8> @trunc_concat_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind { 133; AVX1-LABEL: trunc_concat_packuswb_256: 134; AVX1: # %bb.0: 135; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm2 136; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 137; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 138; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 139; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 140; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 141; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 142; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 143; AVX1-NEXT: retq 144; 145; AVX2-LABEL: trunc_concat_packuswb_256: 146; AVX2: # %bb.0: 147; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0 148; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 149; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 150; AVX2-NEXT: retq 151; 152; AVX512F-LABEL: trunc_concat_packuswb_256: 153; AVX512F: # %bb.0: 154; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 155; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 156; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 157; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 158; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 159; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 160; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 161; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 162; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 163; AVX512F-NEXT: retq 164; 165; AVX512BW-LABEL: trunc_concat_packuswb_256: 166; AVX512BW: # %bb.0: 167; AVX512BW-NEXT: vpsrlw $15, %ymm0, %ymm0 168; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 169; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 170; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 171; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 172; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 173; AVX512BW-NEXT: retq 174 %1 = lshr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> 175 %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 176 %3 = shufflevector <16 x i16> %1, <16 x i16> %2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 177 %4 = trunc <32 x i16> %3 to <32 x i8> 178 ret <32 x i8> %4 179} 180 181; concat(trunc(x),trunc(y)) -> pack 182 183 184define <16 x i16> @concat_trunc_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind { 185; AVX1-LABEL: concat_trunc_packssdw_256: 186; AVX1: # %bb.0: 187; AVX1-NEXT: vpsrad $17, %xmm0, %xmm2 188; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 189; AVX1-NEXT: vpsrad $17, %xmm0, %xmm0 190; AVX1-NEXT: vpsrad $23, %xmm1, %xmm3 191; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 192; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 193; AVX1-NEXT: vpsrad $23, %xmm1, %xmm1 194; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 195; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 196; AVX1-NEXT: retq 197; 198; AVX2-LABEL: concat_trunc_packssdw_256: 199; AVX2: # %bb.0: 200; AVX2-NEXT: vpsrad $17, %ymm0, %ymm0 201; AVX2-NEXT: vpsrad $23, %ymm1, %ymm1 202; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 203; AVX2-NEXT: retq 204; 205; AVX512-LABEL: concat_trunc_packssdw_256: 206; AVX512: # %bb.0: 207; AVX512-NEXT: vpsrad $17, %ymm0, %ymm0 208; AVX512-NEXT: vpsrad $23, %ymm1, %ymm1 209; AVX512-NEXT: vpmovdw %ymm0, %xmm0 210; AVX512-NEXT: vpmovdw %ymm1, %xmm1 211; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 212; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 213; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 214; AVX512-NEXT: retq 215 %1 = ashr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17> 216 %2 = ashr <8 x i32> %a1, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23> 217 %3 = trunc <8 x i32> %1 to <8 x i16> 218 %4 = trunc <8 x i32> %2 to <8 x i16> 219 %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> 220 ret <16 x i16> %5 221} 222 223define <16 x i16> @concat_trunc_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) nounwind { 224; AVX1-LABEL: concat_trunc_packusdw_256: 225; AVX1: # %bb.0: 226; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 227; AVX1-NEXT: vpsrld $17, %xmm2, %xmm2 228; AVX1-NEXT: vpsrld $17, %xmm0, %xmm0 229; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 230; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 231; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 232; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 233; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 234; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 235; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 236; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 237; AVX1-NEXT: retq 238; 239; AVX2-LABEL: concat_trunc_packusdw_256: 240; AVX2: # %bb.0: 241; AVX2-NEXT: vpsrld $17, %ymm0, %ymm0 242; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 243; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 244; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 245; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 246; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 247; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 248; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 249; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 250; AVX2-NEXT: retq 251; 252; AVX512-LABEL: concat_trunc_packusdw_256: 253; AVX512: # %bb.0: 254; AVX512-NEXT: vpsrld $17, %ymm0, %ymm0 255; AVX512-NEXT: vpmovdw %ymm0, %xmm0 256; AVX512-NEXT: vpmovdw %ymm1, %xmm1 257; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 258; AVX512-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 259; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 260; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 261; AVX512-NEXT: retq 262 %1 = lshr <8 x i32> %a0, <i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17, i32 17> 263 %2 = and <8 x i32> %a1, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> 264 %3 = trunc <8 x i32> %1 to <8 x i16> 265 %4 = trunc <8 x i32> %2 to <8 x i16> 266 %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15> 267 ret <16 x i16> %5 268} 269 270define <32 x i8> @concat_trunc_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind { 271; AVX1-LABEL: concat_trunc_packsswb_256: 272; AVX1: # %bb.0: 273; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 274; AVX1-NEXT: vpsraw $15, %xmm2, %xmm2 275; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 276; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 277; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 278; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 279; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 280; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 281; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 282; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 283; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 284; AVX1-NEXT: retq 285; 286; AVX2-LABEL: concat_trunc_packsswb_256: 287; AVX2: # %bb.0: 288; AVX2-NEXT: vpsraw $15, %ymm0, %ymm0 289; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 290; AVX2-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 291; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 292; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 293; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 294; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 295; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 296; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 297; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 298; AVX2-NEXT: retq 299; 300; AVX512F-LABEL: concat_trunc_packsswb_256: 301; AVX512F: # %bb.0: 302; AVX512F-NEXT: vpsraw $15, %ymm0, %ymm0 303; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 304; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 305; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 306; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 307; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 308; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 309; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 310; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 311; AVX512F-NEXT: retq 312; 313; AVX512BW-LABEL: concat_trunc_packsswb_256: 314; AVX512BW: # %bb.0: 315; AVX512BW-NEXT: vpsraw $15, %ymm0, %ymm0 316; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 317; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 318; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 319; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 320; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 321; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 322; AVX512BW-NEXT: retq 323 %1 = ashr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> 324 %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 325 %3 = trunc <16 x i16> %1 to <16 x i8> 326 %4 = trunc <16 x i16> %2 to <16 x i8> 327 %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 328 ret <32 x i8> %5 329} 330 331define <32 x i8> @concat_trunc_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) nounwind { 332; AVX1-LABEL: concat_trunc_packuswb_256: 333; AVX1: # %bb.0: 334; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 335; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm2 336; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0 337; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 338; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 339; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 340; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 341; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 342; AVX1-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 343; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 344; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 345; AVX1-NEXT: retq 346; 347; AVX2-LABEL: concat_trunc_packuswb_256: 348; AVX2: # %bb.0: 349; AVX2-NEXT: vpsrlw $15, %ymm0, %ymm0 350; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 351; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 352; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 353; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 354; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 355; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 356; AVX2-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 357; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 358; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 359; AVX2-NEXT: retq 360; 361; AVX512F-LABEL: concat_trunc_packuswb_256: 362; AVX512F: # %bb.0: 363; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 364; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 365; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 366; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 367; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 368; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 369; AVX512F-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 370; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 371; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 372; AVX512F-NEXT: retq 373; 374; AVX512BW-LABEL: concat_trunc_packuswb_256: 375; AVX512BW: # %bb.0: 376; AVX512BW-NEXT: vpsrlw $15, %ymm0, %ymm0 377; AVX512BW-NEXT: vpmovwb %ymm0, %xmm0 378; AVX512BW-NEXT: vpmovwb %ymm1, %xmm1 379; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 380; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm0[1],xmm1[1] 381; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 382; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 383; AVX512BW-NEXT: retq 384 %1 = lshr <16 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15> 385 %2 = and <16 x i16> %a1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 386 %3 = trunc <16 x i16> %1 to <16 x i8> 387 %4 = trunc <16 x i16> %2 to <16 x i8> 388 %5 = shufflevector <16 x i8> %3, <16 x i8> %4, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 389 ret <32 x i8> %5 390} 391