; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BWNOVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512BWVL

;
; General cases - packing of vector comparison to legal vector result types
;

define <16 x i8> @vselect_packss_v16i16(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqw %xmm3, %xmm1
; SSE2-NEXT: pcmpeqw %xmm2, %xmm0
; SSE2-NEXT: packsswb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: vselect_packss_v16i16:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqw %xmm3, %xmm1
; SSE42-NEXT: pcmpeqw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm1, %xmm0
; SSE42-NEXT: pblendvb %xmm0, %xmm4, %xmm5
; SSE42-NEXT: movdqa %xmm5, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: vselect_packss_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vselect_packss_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vselect_packss_v16i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vselect_packss_v16i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpternlogq $202, %xmm3, %xmm2, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BWNOVL-LABEL: vselect_packss_v16i16:
; AVX512BWNOVL: # %bb.0:
; AVX512BWNOVL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512BWNOVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWNOVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512BWNOVL-NEXT: vzeroupper
; AVX512BWNOVL-NEXT: retq
;
; AVX512BWVL-LABEL: vselect_packss_v16i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
; AVX512BWVL-NEXT: vpmovm2b %k0, %xmm0
; AVX512BWVL-NEXT: vpternlogq $202, %xmm3, %xmm2, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = icmp eq <16 x i16> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

define <16 x i8> @vselect_packss_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss_v16i32:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd %xmm7, %xmm3
; SSE2-NEXT: pcmpeqd %xmm6, %xmm2
; SSE2-NEXT: packssdw %xmm3, %xmm2
; SSE2-NEXT: pcmpeqd %xmm5, %xmm1
; SSE2-NEXT: pcmpeqd %xmm4, %xmm0
; SSE2-NEXT: packssdw %xmm1, %xmm0
; SSE2-NEXT: packsswb %xmm2, %xmm0
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: vselect_packss_v16i32:
; SSE42: # %bb.0:
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE42-NEXT: pcmpeqd %xmm7, %xmm3
; SSE42-NEXT: pcmpeqd %xmm6, %xmm2
; SSE42-NEXT: packssdw %xmm3, %xmm2
; SSE42-NEXT: pcmpeqd %xmm5, %xmm1
; SSE42-NEXT: pcmpeqd %xmm4, %xmm0
; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: packsswb %xmm2, %xmm0
; SSE42-NEXT: pblendvb %xmm0, {{[0-9]+}}(%rsp), %xmm8
; SSE42-NEXT: movdqa %xmm8, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: vselect_packss_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpcmpeqd %xmm6, %xmm7, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT: vpcmpeqd %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vselect_packss_v16i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vselect_packss_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vselect_packss_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqd %zmm1, %zmm0, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpternlogq $202, %xmm3, %xmm2, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BWNOVL-LABEL: vselect_packss_v16i32:
; AVX512BWNOVL: # %bb.0:
; AVX512BWNOVL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BWNOVL-NEXT: vpmovm2b %k0, %zmm0
; AVX512BWNOVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512BWNOVL-NEXT: vzeroupper
; AVX512BWNOVL-NEXT: retq
;
; AVX512BWVL-LABEL: vselect_packss_v16i32:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BWVL-NEXT: vpmovm2b %k0, %xmm0
; AVX512BWVL-NEXT: vpternlogq $202, %xmm3, %xmm2, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = icmp eq <16 x i32> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss_v16i64:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,0,3,2]
; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,0,3,2]
; SSE2-NEXT: pand %xmm6, %xmm7
; SSE2-NEXT: packssdw %xmm8, %xmm7
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: packssdw %xmm6, %xmm5
; SSE2-NEXT: packssdw %xmm7, %xmm5
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2]
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: packssdw %xmm4, %xmm3
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: packssdw %xmm2, %xmm0
; SSE2-NEXT: packssdw %xmm3, %xmm0
; SSE2-NEXT: packsswb %xmm5, %xmm0
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: pand %xmm0, %xmm1
; SSE2-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: vselect_packss_v16i64:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm7
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm6
; SSE42-NEXT: packssdw %xmm7, %xmm6
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm5
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm4
; SSE42-NEXT: packssdw %xmm5, %xmm4
; SSE42-NEXT: packssdw %xmm6, %xmm4
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm3
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm2
; SSE42-NEXT: packssdw %xmm3, %xmm2
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT: pcmpeqq {{[0-9]+}}(%rsp), %xmm0
; SSE42-NEXT: packssdw %xmm1, %xmm0
; SSE42-NEXT: packssdw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm4, %xmm0
; SSE42-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT: pand %xmm0, %xmm1
; SSE42-NEXT: pandn {{[0-9]+}}(%rsp), %xmm0
; SSE42-NEXT: por %xmm1, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: vselect_packss_v16i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm9
; AVX1-NEXT: vpcmpeqq %xmm8, %xmm9, %xmm8
; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpackssdw %xmm8, %xmm3, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm8, %xmm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpeqq %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vpcmpeqq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
; AVX1-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vselect_packss_v16i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpcmpeqq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpackssdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpcmpeqq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackssdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
; AVX2-NEXT: vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vselect_packss_v16i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
; AVX512F-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
; AVX512F-NEXT: kunpckbw %k0, %k1, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vselect_packss_v16i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
; AVX512VL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT: vpternlogq $202, %xmm5, %xmm4, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BWNOVL-LABEL: vselect_packss_v16i64:
; AVX512BWNOVL: # %bb.0:
; AVX512BWNOVL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
; AVX512BWNOVL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
; AVX512BWNOVL-NEXT: kunpckbw %k0, %k1, %k0
; AVX512BWNOVL-NEXT: vpmovm2b %k0, %zmm0
; AVX512BWNOVL-NEXT: vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
; AVX512BWNOVL-NEXT: vzeroupper
; AVX512BWNOVL-NEXT: retq
;
; AVX512BWVL-LABEL: vselect_packss_v16i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpcmpeqq %zmm2, %zmm0, %k0
; AVX512BWVL-NEXT: vpcmpeqq %zmm3, %zmm1, %k1
; AVX512BWVL-NEXT: kunpckbw %k0, %k1, %k0
; AVX512BWVL-NEXT: vpmovm2b %k0, %xmm0
; AVX512BWVL-NEXT: vpternlogq $202, %xmm5, %xmm4, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = icmp eq <16 x i64> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

;
; PACKSS case
;

define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss:
; SSE2: # %bb.0:
; SSE2-NEXT: pcmpeqw %xmm3, %xmm1
; SSE2-NEXT: pcmpeqw %xmm2, %xmm0
; SSE2-NEXT: packsswb %xmm1, %xmm0
; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm0
; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE42-LABEL: vselect_packss:
; SSE42: # %bb.0:
; SSE42-NEXT: pcmpeqw %xmm3, %xmm1
; SSE42-NEXT: pcmpeqw %xmm2, %xmm0
; SSE42-NEXT: packsswb %xmm1, %xmm0
; SSE42-NEXT: pblendvb %xmm0, %xmm4, %xmm5
; SSE42-NEXT: movdqa %xmm5, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: vselect_packss:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: vselect_packss:
; AVX2: # %bb.0:
; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: vselect_packss:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: vselect_packss:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpternlogq $202, %xmm3, %xmm2, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BWNOVL-LABEL: vselect_packss:
; AVX512BWNOVL: # %bb.0:
; AVX512BWNOVL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512BWNOVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWNOVL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX512BWNOVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512BWNOVL-NEXT: vzeroupper
; AVX512BWNOVL-NEXT: retq
;
; AVX512BWVL-LABEL: vselect_packss:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpternlogq $202, %xmm3, %xmm2, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
  %1 = icmp eq <16 x i16> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i16>
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %3, <8 x i16> %4)
  %6 = and <16 x i8> %5, %a2
  %7 = xor <16 x i8> %5, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %8 = and <16 x i8> %7, %a3
  %9 = or <16 x i8> %6, %8
  ret <16 x i8> %9
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)