; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefixes=SKX

; The invocations above compile this file for x86-64 with feature sets from
; sse2 up to avx512bw+avx512vl (some with the fast-variable-shuffle tuning
; attributes) plus one -mcpu=skx configuration; each one is verified against
; its own group of check prefixes.

;
; PACKUS saturation truncation to vXi32
;

; Clamps each signed i64 lane of %a0 to the range [0, 4294967295] and then
; truncates it to i32, returning the <2 x i32> result.
;   %1, %2  signed minimum of %a0 and 4294967295 (icmp slt + select)
;   %3, %4  signed maximum of that value and zero (icmp sgt + select)
;   %5      truncation of the clamped lanes to i32
; The check lines inside the function are autogenerated (see the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
define <2 x i32> @trunc_packus_v2i64_v2i32(<2 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v2i64_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc_packus_v2i64_v2i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pxor %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pxor %xmm4, %xmm4
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT:    por %xmm2, %xmm3
; SSSE3-NEXT:    pand %xmm3, %xmm0
; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT:    por %xmm0, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT:    por %xmm0, %xmm1
; SSSE3-NEXT:    pand %xmm3, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc_packus_v2i64_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT:    pxor %xmm3, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647]
; SSE41-NEXT:    movdqa %xmm4, %xmm5
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    por %xmm4, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    xorpd %xmm1, %xmm1
; SSE41-NEXT:    movapd %xmm2, %xmm4
; SSE41-NEXT:    xorpd %xmm3, %xmm4
; SSE41-NEXT:    movapd %xmm4, %xmm5
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    por %xmm4, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_packus_v2i64_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovusqd %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_packus_v2i64_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpmovusqd %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_packus_v2i64_v2i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovusqd %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpmovusqd %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v2i64_v2i32:
; SKX:       # %bb.0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; SKX-NEXT:    vpmovusqd %xmm0, %xmm0
; SKX-NEXT:    retq
  %1 = icmp slt <2 x i64> %a0, <i64 4294967295, i64 4294967295>
  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>
  %3 = icmp sgt <2 x i64> %2, zeroinitializer
  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
  %5 = trunc <2 x i64> %4 to <2 x i32>
  ret <2 x i32> %5
}

; Clamps each signed i64 lane of %a0 to the range [0, 4294967295], truncates it
; to i32, and stores the <2 x i32> result through %p1 (nothing is returned).
;   %1, %2  signed minimum of %a0 and 4294967295 (icmp slt + select)
;   %3, %4  signed maximum of that value and zero (icmp sgt + select)
;   %5      truncation of the clamped lanes to i32, which is then stored
; The check lines inside the function are in the format emitted by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.
define void @trunc_packus_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; SSE2-LABEL: trunc_packus_v2i64_v2i32_store:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpeqd %xmm3, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm0, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    pcmpgtd %xmm1, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT:    por %xmm0, %xmm1
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE2-NEXT:    movq %xmm0, (%rdi)
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: trunc_packus_v2i64_v2i32_store:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pxor %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pxor %xmm4, %xmm4
; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm4
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483647,2147483647]
; SSSE3-NEXT:    pcmpgtd %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT:    por %xmm2, %xmm3
; SSSE3-NEXT:    pand %xmm3, %xmm0
; SSSE3-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT:    por %xmm0, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    pxor %xmm1, %xmm0
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT:    pand %xmm2, %xmm0
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT:    por %xmm0, %xmm1
; SSSE3-NEXT:    pand %xmm3, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSSE3-NEXT:    movq %xmm0, (%rdi)
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: trunc_packus_v2i64_v2i32_store:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    movapd {{.*#+}} xmm2 = [4294967295,4294967295]
; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT:    pxor %xmm3, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [2147483647,2147483647]
; SSE41-NEXT:    movdqa %xmm4, %xmm5
; SSE41-NEXT:    pcmpeqd %xmm0, %xmm5
; SSE41-NEXT:    pcmpgtd %xmm0, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    por %xmm4, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    xorpd %xmm1, %xmm1
; SSE41-NEXT:    movapd %xmm2, %xmm4
; SSE41-NEXT:    xorpd %xmm3, %xmm4
; SSE41-NEXT:    movapd %xmm4, %xmm5
; SSE41-NEXT:    pcmpeqd %xmm3, %xmm5
; SSE41-NEXT:    pcmpgtd %xmm3, %xmm4
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    por %xmm4, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE41-NEXT:    movq %xmm0, (%rdi)
; SSE41-NEXT:    retq
;
; AVX-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295]
; AVX-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT:    vmovq %xmm0, (%rdi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovusqd %zmm0, %ymm0
; AVX512F-NEXT:    vmovq %xmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpmovusqd %xmm0, (%rdi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovusqd %zmm0, %ymm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i32_store:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpmovusqd %xmm0, (%rdi)
; AVX512BWVL-NEXT:    retq
;
; SKX-LABEL: trunc_packus_v2i64_v2i32_store:
; SKX:       # %bb.0:
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpmaxsq %xmm1, %xmm0, %xmm0
; SKX-NEXT:    vpmovusqd %xmm0, (%rdi)
; SKX-NEXT:    retq
  %1 = icmp slt <2 x i64> %a0, <i64 4294967295, i64 4294967295>
  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 4294967295, i64 4294967295>
  %3 = icmp sgt <2 x i64> %2, zeroinitializer
  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
  %5 = trunc <2 x i64> %4 to <2 x i32>
  store <2 x i32> %5, ptr %p1
  ret void
}
320 321define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { 322; SSE2-LABEL: trunc_packus_v4i64_v4i32: 323; SSE2: # %bb.0: 324; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] 325; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 326; SSE2-NEXT: movdqa %xmm0, %xmm4 327; SSE2-NEXT: pxor %xmm2, %xmm4 328; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 329; SSE2-NEXT: pxor %xmm6, %xmm6 330; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 331; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] 332; SSE2-NEXT: movdqa %xmm3, %xmm7 333; SSE2-NEXT: pcmpgtd %xmm4, %xmm7 334; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] 335; SSE2-NEXT: pand %xmm5, %xmm4 336; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 337; SSE2-NEXT: por %xmm4, %xmm5 338; SSE2-NEXT: pand %xmm5, %xmm0 339; SSE2-NEXT: pandn %xmm8, %xmm5 340; SSE2-NEXT: por %xmm5, %xmm0 341; SSE2-NEXT: movdqa %xmm1, %xmm4 342; SSE2-NEXT: pxor %xmm2, %xmm4 343; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 344; SSE2-NEXT: pcmpeqd %xmm6, %xmm5 345; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 346; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] 347; SSE2-NEXT: pand %xmm5, %xmm4 348; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 349; SSE2-NEXT: por %xmm4, %xmm3 350; SSE2-NEXT: pand %xmm3, %xmm1 351; SSE2-NEXT: pandn %xmm8, %xmm3 352; SSE2-NEXT: por %xmm1, %xmm3 353; SSE2-NEXT: movdqa %xmm3, %xmm1 354; SSE2-NEXT: pxor %xmm2, %xmm1 355; SSE2-NEXT: movdqa %xmm1, %xmm4 356; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 357; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 358; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 359; SSE2-NEXT: pand %xmm4, %xmm1 360; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 361; SSE2-NEXT: por %xmm1, %xmm4 362; SSE2-NEXT: pand %xmm3, %xmm4 363; SSE2-NEXT: movdqa %xmm0, %xmm1 364; SSE2-NEXT: pxor %xmm2, %xmm1 365; SSE2-NEXT: movdqa %xmm1, %xmm3 366; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 367; SSE2-NEXT: pcmpeqd %xmm2, %xmm1 368; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 369; SSE2-NEXT: pand %xmm3, %xmm1 370; 
SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 371; SSE2-NEXT: por %xmm1, %xmm2 372; SSE2-NEXT: pand %xmm2, %xmm0 373; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] 374; SSE2-NEXT: retq 375; 376; SSSE3-LABEL: trunc_packus_v4i64_v4i32: 377; SSSE3: # %bb.0: 378; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] 379; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 380; SSSE3-NEXT: movdqa %xmm0, %xmm4 381; SSSE3-NEXT: pxor %xmm2, %xmm4 382; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 383; SSSE3-NEXT: pxor %xmm6, %xmm6 384; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 385; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483647,2147483647] 386; SSSE3-NEXT: movdqa %xmm3, %xmm7 387; SSSE3-NEXT: pcmpgtd %xmm4, %xmm7 388; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] 389; SSSE3-NEXT: pand %xmm5, %xmm4 390; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 391; SSSE3-NEXT: por %xmm4, %xmm5 392; SSSE3-NEXT: pand %xmm5, %xmm0 393; SSSE3-NEXT: pandn %xmm8, %xmm5 394; SSSE3-NEXT: por %xmm5, %xmm0 395; SSSE3-NEXT: movdqa %xmm1, %xmm4 396; SSSE3-NEXT: pxor %xmm2, %xmm4 397; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 398; SSSE3-NEXT: pcmpeqd %xmm6, %xmm5 399; SSSE3-NEXT: pcmpgtd %xmm4, %xmm3 400; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] 401; SSSE3-NEXT: pand %xmm5, %xmm4 402; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 403; SSSE3-NEXT: por %xmm4, %xmm3 404; SSSE3-NEXT: pand %xmm3, %xmm1 405; SSSE3-NEXT: pandn %xmm8, %xmm3 406; SSSE3-NEXT: por %xmm1, %xmm3 407; SSSE3-NEXT: movdqa %xmm3, %xmm1 408; SSSE3-NEXT: pxor %xmm2, %xmm1 409; SSSE3-NEXT: movdqa %xmm1, %xmm4 410; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 411; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 412; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 413; SSSE3-NEXT: pand %xmm4, %xmm1 414; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 415; SSSE3-NEXT: por %xmm1, %xmm4 416; SSSE3-NEXT: pand %xmm3, %xmm4 417; SSSE3-NEXT: movdqa %xmm0, %xmm1 418; SSSE3-NEXT: pxor %xmm2, %xmm1 419; SSSE3-NEXT: movdqa %xmm1, 
%xmm3 420; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 421; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1 422; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 423; SSSE3-NEXT: pand %xmm3, %xmm1 424; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 425; SSSE3-NEXT: por %xmm1, %xmm2 426; SSSE3-NEXT: pand %xmm2, %xmm0 427; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] 428; SSSE3-NEXT: retq 429; 430; SSE41-LABEL: trunc_packus_v4i64_v4i32: 431; SSE41: # %bb.0: 432; SSE41-NEXT: movdqa %xmm0, %xmm2 433; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] 434; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] 435; SSE41-NEXT: pxor %xmm3, %xmm0 436; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] 437; SSE41-NEXT: movdqa %xmm6, %xmm5 438; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 439; SSE41-NEXT: movdqa %xmm6, %xmm7 440; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 441; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 442; SSE41-NEXT: pand %xmm5, %xmm0 443; SSE41-NEXT: por %xmm7, %xmm0 444; SSE41-NEXT: movapd %xmm4, %xmm5 445; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 446; SSE41-NEXT: movdqa %xmm1, %xmm0 447; SSE41-NEXT: pxor %xmm3, %xmm0 448; SSE41-NEXT: movdqa %xmm6, %xmm2 449; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 450; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 451; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 452; SSE41-NEXT: pand %xmm2, %xmm0 453; SSE41-NEXT: por %xmm6, %xmm0 454; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 455; SSE41-NEXT: xorpd %xmm1, %xmm1 456; SSE41-NEXT: movapd %xmm4, %xmm2 457; SSE41-NEXT: xorpd %xmm3, %xmm2 458; SSE41-NEXT: movapd %xmm2, %xmm6 459; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 460; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 461; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 462; SSE41-NEXT: pand %xmm6, %xmm0 463; SSE41-NEXT: por %xmm2, %xmm0 464; SSE41-NEXT: pxor %xmm2, %xmm2 465; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 466; SSE41-NEXT: movapd %xmm5, %xmm4 467; SSE41-NEXT: xorpd %xmm3, %xmm4 468; SSE41-NEXT: movapd %xmm4, %xmm6 469; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 470; 
SSE41-NEXT: pcmpgtd %xmm3, %xmm4 471; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 472; SSE41-NEXT: pand %xmm6, %xmm0 473; SSE41-NEXT: por %xmm4, %xmm0 474; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 475; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 476; SSE41-NEXT: movaps %xmm1, %xmm0 477; SSE41-NEXT: retq 478; 479; AVX1-LABEL: trunc_packus_v4i64_v4i32: 480; AVX1: # %bb.0: 481; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] 482; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 483; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 484; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 485; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 486; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 487; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 488; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 489; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 490; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 491; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 492; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2] 493; AVX1-NEXT: vzeroupper 494; AVX1-NEXT: retq 495; 496; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32: 497; AVX2-SLOW: # %bb.0: 498; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] 499; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 500; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 501; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 502; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 503; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0 504; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 505; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 506; AVX2-SLOW-NEXT: vzeroupper 507; AVX2-SLOW-NEXT: retq 508; 509; AVX2-FAST-ALL-LABEL: trunc_packus_v4i64_v4i32: 510; AVX2-FAST-ALL: # %bb.0: 511; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] 512; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 513; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 514; AVX2-FAST-ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 515; AVX2-FAST-ALL-NEXT: 
vpcmpgtq %ymm1, %ymm0, %ymm1 516; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm1, %ymm0 517; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 518; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 519; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 520; AVX2-FAST-ALL-NEXT: vzeroupper 521; AVX2-FAST-ALL-NEXT: retq 522; 523; AVX2-FAST-PERLANE-LABEL: trunc_packus_v4i64_v4i32: 524; AVX2-FAST-PERLANE: # %bb.0: 525; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] 526; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 527; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 528; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 529; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 530; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm1, %ymm0 531; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 532; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 533; AVX2-FAST-PERLANE-NEXT: vzeroupper 534; AVX2-FAST-PERLANE-NEXT: retq 535; 536; AVX512F-LABEL: trunc_packus_v4i64_v4i32: 537; AVX512F: # %bb.0: 538; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 539; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 540; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 541; AVX512F-NEXT: vpmovusqd %zmm0, %ymm0 542; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 543; AVX512F-NEXT: vzeroupper 544; AVX512F-NEXT: retq 545; 546; AVX512VL-LABEL: trunc_packus_v4i64_v4i32: 547; AVX512VL: # %bb.0: 548; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 549; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 550; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0 551; AVX512VL-NEXT: vzeroupper 552; AVX512VL-NEXT: retq 553; 554; AVX512BW-LABEL: trunc_packus_v4i64_v4i32: 555; AVX512BW: # %bb.0: 556; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 557; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 558; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 559; AVX512BW-NEXT: vpmovusqd %zmm0, %ymm0 560; AVX512BW-NEXT: # kill: def $xmm0 
killed $xmm0 killed $ymm0 561; AVX512BW-NEXT: vzeroupper 562; AVX512BW-NEXT: retq 563; 564; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32: 565; AVX512BWVL: # %bb.0: 566; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 567; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 568; AVX512BWVL-NEXT: vpmovusqd %ymm0, %xmm0 569; AVX512BWVL-NEXT: vzeroupper 570; AVX512BWVL-NEXT: retq 571; 572; SKX-LABEL: trunc_packus_v4i64_v4i32: 573; SKX: # %bb.0: 574; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 575; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 576; SKX-NEXT: vpmovusqd %ymm0, %xmm0 577; SKX-NEXT: vzeroupper 578; SKX-NEXT: retq 579 %1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 580 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 581 %3 = icmp sgt <4 x i64> %2, zeroinitializer 582 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer 583 %5 = trunc <4 x i64> %4 to <4 x i32> 584 ret <4 x i32> %5 585} 586 587 588define <8 x i32> @trunc_packus_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256" { 589; SSE2-LABEL: trunc_packus_v8i64_v8i32: 590; SSE2: # %bb.0: 591; SSE2-NEXT: movdqa (%rdi), %xmm3 592; SSE2-NEXT: movdqa 16(%rdi), %xmm4 593; SSE2-NEXT: movdqa 32(%rdi), %xmm6 594; SSE2-NEXT: movdqa 48(%rdi), %xmm10 595; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] 596; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] 597; SSE2-NEXT: movdqa %xmm3, %xmm2 598; SSE2-NEXT: pxor %xmm11, %xmm2 599; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 600; SSE2-NEXT: pxor %xmm9, %xmm9 601; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 602; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483647,2147483647] 603; SSE2-NEXT: movdqa %xmm1, %xmm5 604; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 605; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 606; SSE2-NEXT: pand %xmm7, %xmm0 607; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] 608; SSE2-NEXT: por %xmm0, %xmm2 609; SSE2-NEXT: pand %xmm2, %xmm3 610; SSE2-NEXT: 
pandn %xmm8, %xmm2 611; SSE2-NEXT: por %xmm3, %xmm2 612; SSE2-NEXT: movdqa %xmm4, %xmm0 613; SSE2-NEXT: pxor %xmm11, %xmm0 614; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 615; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 616; SSE2-NEXT: movdqa %xmm1, %xmm5 617; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 618; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 619; SSE2-NEXT: pand %xmm3, %xmm0 620; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] 621; SSE2-NEXT: por %xmm0, %xmm3 622; SSE2-NEXT: pand %xmm3, %xmm4 623; SSE2-NEXT: pandn %xmm8, %xmm3 624; SSE2-NEXT: por %xmm4, %xmm3 625; SSE2-NEXT: movdqa %xmm6, %xmm0 626; SSE2-NEXT: pxor %xmm11, %xmm0 627; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 628; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 629; SSE2-NEXT: movdqa %xmm1, %xmm5 630; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 631; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 632; SSE2-NEXT: pand %xmm4, %xmm0 633; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] 634; SSE2-NEXT: por %xmm0, %xmm4 635; SSE2-NEXT: pand %xmm4, %xmm6 636; SSE2-NEXT: pandn %xmm8, %xmm4 637; SSE2-NEXT: por %xmm6, %xmm4 638; SSE2-NEXT: movdqa %xmm10, %xmm0 639; SSE2-NEXT: pxor %xmm11, %xmm0 640; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 641; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 642; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 643; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] 644; SSE2-NEXT: pand %xmm5, %xmm0 645; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 646; SSE2-NEXT: por %xmm0, %xmm1 647; SSE2-NEXT: pand %xmm1, %xmm10 648; SSE2-NEXT: pandn %xmm8, %xmm1 649; SSE2-NEXT: por %xmm10, %xmm1 650; SSE2-NEXT: movdqa %xmm1, %xmm0 651; SSE2-NEXT: pxor %xmm11, %xmm0 652; SSE2-NEXT: movdqa %xmm0, %xmm5 653; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 654; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 655; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 656; SSE2-NEXT: pand %xmm5, %xmm0 657; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 658; SSE2-NEXT: por %xmm0, %xmm5 659; SSE2-NEXT: pand %xmm1, %xmm5 660; SSE2-NEXT: movdqa %xmm4, %xmm0 661; SSE2-NEXT: pxor 
%xmm11, %xmm0 662; SSE2-NEXT: movdqa %xmm0, %xmm1 663; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 664; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 665; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 666; SSE2-NEXT: pand %xmm1, %xmm0 667; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 668; SSE2-NEXT: por %xmm0, %xmm1 669; SSE2-NEXT: pand %xmm4, %xmm1 670; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] 671; SSE2-NEXT: movdqa %xmm3, %xmm0 672; SSE2-NEXT: pxor %xmm11, %xmm0 673; SSE2-NEXT: movdqa %xmm0, %xmm4 674; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 675; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 676; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 677; SSE2-NEXT: pand %xmm4, %xmm0 678; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 679; SSE2-NEXT: por %xmm0, %xmm4 680; SSE2-NEXT: pand %xmm3, %xmm4 681; SSE2-NEXT: movdqa %xmm2, %xmm0 682; SSE2-NEXT: pxor %xmm11, %xmm0 683; SSE2-NEXT: movdqa %xmm0, %xmm3 684; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 685; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 686; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 687; SSE2-NEXT: pand %xmm3, %xmm5 688; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] 689; SSE2-NEXT: por %xmm5, %xmm0 690; SSE2-NEXT: pand %xmm2, %xmm0 691; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] 692; SSE2-NEXT: retq 693; 694; SSSE3-LABEL: trunc_packus_v8i64_v8i32: 695; SSSE3: # %bb.0: 696; SSSE3-NEXT: movdqa (%rdi), %xmm3 697; SSSE3-NEXT: movdqa 16(%rdi), %xmm4 698; SSSE3-NEXT: movdqa 32(%rdi), %xmm6 699; SSSE3-NEXT: movdqa 48(%rdi), %xmm10 700; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] 701; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] 702; SSSE3-NEXT: movdqa %xmm3, %xmm2 703; SSSE3-NEXT: pxor %xmm11, %xmm2 704; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 705; SSSE3-NEXT: pxor %xmm9, %xmm9 706; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 707; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483647,2147483647] 708; SSSE3-NEXT: movdqa %xmm1, %xmm5 709; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 710; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = 
xmm5[0,0,2,2] 711; SSSE3-NEXT: pand %xmm7, %xmm0 712; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] 713; SSSE3-NEXT: por %xmm0, %xmm2 714; SSSE3-NEXT: pand %xmm2, %xmm3 715; SSSE3-NEXT: pandn %xmm8, %xmm2 716; SSSE3-NEXT: por %xmm3, %xmm2 717; SSSE3-NEXT: movdqa %xmm4, %xmm0 718; SSSE3-NEXT: pxor %xmm11, %xmm0 719; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 720; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 721; SSSE3-NEXT: movdqa %xmm1, %xmm5 722; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 723; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 724; SSSE3-NEXT: pand %xmm3, %xmm0 725; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] 726; SSSE3-NEXT: por %xmm0, %xmm3 727; SSSE3-NEXT: pand %xmm3, %xmm4 728; SSSE3-NEXT: pandn %xmm8, %xmm3 729; SSSE3-NEXT: por %xmm4, %xmm3 730; SSSE3-NEXT: movdqa %xmm6, %xmm0 731; SSSE3-NEXT: pxor %xmm11, %xmm0 732; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 733; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 734; SSSE3-NEXT: movdqa %xmm1, %xmm5 735; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 736; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 737; SSSE3-NEXT: pand %xmm4, %xmm0 738; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] 739; SSSE3-NEXT: por %xmm0, %xmm4 740; SSSE3-NEXT: pand %xmm4, %xmm6 741; SSSE3-NEXT: pandn %xmm8, %xmm4 742; SSSE3-NEXT: por %xmm6, %xmm4 743; SSSE3-NEXT: movdqa %xmm10, %xmm0 744; SSSE3-NEXT: pxor %xmm11, %xmm0 745; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 746; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 747; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 748; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] 749; SSSE3-NEXT: pand %xmm5, %xmm0 750; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 751; SSSE3-NEXT: por %xmm0, %xmm1 752; SSSE3-NEXT: pand %xmm1, %xmm10 753; SSSE3-NEXT: pandn %xmm8, %xmm1 754; SSSE3-NEXT: por %xmm10, %xmm1 755; SSSE3-NEXT: movdqa %xmm1, %xmm0 756; SSSE3-NEXT: pxor %xmm11, %xmm0 757; SSSE3-NEXT: movdqa %xmm0, %xmm5 758; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 759; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 760; SSSE3-NEXT: pshufd {{.*#+}} xmm0 
= xmm0[1,1,3,3] 761; SSSE3-NEXT: pand %xmm5, %xmm0 762; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 763; SSSE3-NEXT: por %xmm0, %xmm5 764; SSSE3-NEXT: pand %xmm1, %xmm5 765; SSSE3-NEXT: movdqa %xmm4, %xmm0 766; SSSE3-NEXT: pxor %xmm11, %xmm0 767; SSSE3-NEXT: movdqa %xmm0, %xmm1 768; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 769; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 770; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 771; SSSE3-NEXT: pand %xmm1, %xmm0 772; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 773; SSSE3-NEXT: por %xmm0, %xmm1 774; SSSE3-NEXT: pand %xmm4, %xmm1 775; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] 776; SSSE3-NEXT: movdqa %xmm3, %xmm0 777; SSSE3-NEXT: pxor %xmm11, %xmm0 778; SSSE3-NEXT: movdqa %xmm0, %xmm4 779; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 780; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 781; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 782; SSSE3-NEXT: pand %xmm4, %xmm0 783; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 784; SSSE3-NEXT: por %xmm0, %xmm4 785; SSSE3-NEXT: pand %xmm3, %xmm4 786; SSSE3-NEXT: movdqa %xmm2, %xmm0 787; SSSE3-NEXT: pxor %xmm11, %xmm0 788; SSSE3-NEXT: movdqa %xmm0, %xmm3 789; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 790; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 791; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 792; SSSE3-NEXT: pand %xmm3, %xmm5 793; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] 794; SSSE3-NEXT: por %xmm5, %xmm0 795; SSSE3-NEXT: pand %xmm2, %xmm0 796; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] 797; SSSE3-NEXT: retq 798; 799; SSE41-LABEL: trunc_packus_v8i64_v8i32: 800; SSE41: # %bb.0: 801; SSE41-NEXT: movdqa (%rdi), %xmm5 802; SSE41-NEXT: movdqa 16(%rdi), %xmm4 803; SSE41-NEXT: movdqa 32(%rdi), %xmm10 804; SSE41-NEXT: movdqa 48(%rdi), %xmm9 805; SSE41-NEXT: movapd {{.*#+}} xmm1 = [4294967295,4294967295] 806; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] 807; SSE41-NEXT: movdqa %xmm5, %xmm0 808; SSE41-NEXT: pxor %xmm3, %xmm0 809; SSE41-NEXT: movdqa {{.*#+}} xmm2 = 
[2147483647,2147483647] 810; SSE41-NEXT: movdqa %xmm2, %xmm7 811; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 812; SSE41-NEXT: movdqa %xmm2, %xmm6 813; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 814; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 815; SSE41-NEXT: pand %xmm7, %xmm0 816; SSE41-NEXT: por %xmm6, %xmm0 817; SSE41-NEXT: movapd %xmm1, %xmm8 818; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm8 819; SSE41-NEXT: movdqa %xmm4, %xmm0 820; SSE41-NEXT: pxor %xmm3, %xmm0 821; SSE41-NEXT: movdqa %xmm2, %xmm5 822; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 823; SSE41-NEXT: movdqa %xmm2, %xmm6 824; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 825; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 826; SSE41-NEXT: pand %xmm5, %xmm0 827; SSE41-NEXT: por %xmm6, %xmm0 828; SSE41-NEXT: movapd %xmm1, %xmm5 829; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm5 830; SSE41-NEXT: movdqa %xmm10, %xmm0 831; SSE41-NEXT: pxor %xmm3, %xmm0 832; SSE41-NEXT: movdqa %xmm2, %xmm4 833; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 834; SSE41-NEXT: movdqa %xmm2, %xmm6 835; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 836; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 837; SSE41-NEXT: pand %xmm4, %xmm0 838; SSE41-NEXT: por %xmm6, %xmm0 839; SSE41-NEXT: movapd %xmm1, %xmm4 840; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm4 841; SSE41-NEXT: movdqa %xmm9, %xmm0 842; SSE41-NEXT: pxor %xmm3, %xmm0 843; SSE41-NEXT: movdqa %xmm2, %xmm6 844; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 845; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 846; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 847; SSE41-NEXT: pand %xmm6, %xmm0 848; SSE41-NEXT: por %xmm2, %xmm0 849; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 850; SSE41-NEXT: pxor %xmm2, %xmm2 851; SSE41-NEXT: movapd %xmm1, %xmm6 852; SSE41-NEXT: xorpd %xmm3, %xmm6 853; SSE41-NEXT: movapd %xmm6, %xmm7 854; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 855; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 856; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 857; SSE41-NEXT: pand %xmm7, %xmm0 858; SSE41-NEXT: por %xmm6, %xmm0 859; SSE41-NEXT: pxor %xmm6, %xmm6 860; SSE41-NEXT: blendvpd 
%xmm0, %xmm1, %xmm6 861; SSE41-NEXT: movapd %xmm4, %xmm1 862; SSE41-NEXT: xorpd %xmm3, %xmm1 863; SSE41-NEXT: movapd %xmm1, %xmm7 864; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 865; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 866; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] 867; SSE41-NEXT: pand %xmm7, %xmm0 868; SSE41-NEXT: por %xmm1, %xmm0 869; SSE41-NEXT: pxor %xmm1, %xmm1 870; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 871; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2] 872; SSE41-NEXT: movapd %xmm5, %xmm4 873; SSE41-NEXT: xorpd %xmm3, %xmm4 874; SSE41-NEXT: movapd %xmm4, %xmm6 875; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 876; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 877; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 878; SSE41-NEXT: pand %xmm6, %xmm0 879; SSE41-NEXT: por %xmm4, %xmm0 880; SSE41-NEXT: pxor %xmm4, %xmm4 881; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 882; SSE41-NEXT: movapd %xmm8, %xmm5 883; SSE41-NEXT: xorpd %xmm3, %xmm5 884; SSE41-NEXT: movapd %xmm5, %xmm6 885; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 886; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 887; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 888; SSE41-NEXT: pand %xmm6, %xmm0 889; SSE41-NEXT: por %xmm5, %xmm0 890; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 891; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] 892; SSE41-NEXT: movaps %xmm2, %xmm0 893; SSE41-NEXT: retq 894; 895; AVX1-LABEL: trunc_packus_v8i64_v8i32: 896; AVX1: # %bb.0: 897; AVX1-NEXT: vmovdqa (%rdi), %xmm0 898; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 899; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 900; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 901; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4294967295,4294967295] 902; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 903; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 904; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 905; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 906; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 907; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 908; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 909; AVX1-NEXT: vblendvpd %xmm5, 
%xmm2, %xmm4, %xmm2 910; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 911; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 912; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm2 913; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 914; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 915; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5 916; AVX1-NEXT: vpand %xmm3, %xmm5, %xmm3 917; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm4 918; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm1 919; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 920; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 921; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] 922; AVX1-NEXT: retq 923; 924; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32: 925; AVX2-SLOW: # %bb.0: 926; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 927; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 928; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] 929; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 930; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 931; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 932; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 933; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 934; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 935; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1 936; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 937; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0 938; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 939; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 940; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 941; AVX2-SLOW-NEXT: retq 942; 943; AVX2-FAST-ALL-LABEL: trunc_packus_v8i64_v8i32: 944; AVX2-FAST-ALL: # %bb.0: 945; AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 946; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 947; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] 948; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 949; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 950; AVX2-FAST-ALL-NEXT: 
vpcmpgtq %ymm1, %ymm2, %ymm3 951; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 952; AVX2-FAST-ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 953; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 954; AVX2-FAST-ALL-NEXT: vpand %ymm1, %ymm3, %ymm1 955; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 956; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm2, %ymm0 957; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 958; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 959; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 960; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 961; AVX2-FAST-ALL-NEXT: retq 962; 963; AVX2-FAST-PERLANE-LABEL: trunc_packus_v8i64_v8i32: 964; AVX2-FAST-PERLANE: # %bb.0: 965; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 966; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 967; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] 968; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 969; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 970; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 971; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 972; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 973; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 974; AVX2-FAST-PERLANE-NEXT: vpand %ymm1, %ymm3, %ymm1 975; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 976; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm2, %ymm0 977; AVX2-FAST-PERLANE-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] 978; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 979; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] 980; AVX2-FAST-PERLANE-NEXT: retq 981; 982; AVX512-LABEL: trunc_packus_v8i64_v8i32: 983; AVX512: # %bb.0: 984; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 985; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 986; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 987; AVX512-NEXT: retq 988; 989; SKX-LABEL: trunc_packus_v8i64_v8i32: 990; SKX: # 
%bb.0: 991; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 992; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm1 993; SKX-NEXT: vpmovusqd %ymm1, %xmm1 994; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm0 995; SKX-NEXT: vpmovusqd %ymm0, %xmm0 996; SKX-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 997; SKX-NEXT: retq 998 %a0 = load <8 x i64>, ptr %p0 999 %1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1000 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295> 1001 %3 = icmp sgt <8 x i64> %2, zeroinitializer 1002 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 1003 %5 = trunc <8 x i64> %4 to <8 x i32> 1004 ret <8 x i32> %5 1005} 1006 1007; 1008; PACKUS saturation truncation to vXi16 1009; 1010 1011define <2 x i16> @trunc_packus_v2i64_v2i16(<2 x i64> %a0) { 1012; SSE2-LABEL: trunc_packus_v2i64_v2i16: 1013; SSE2: # %bb.0: 1014; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 1015; SSE2-NEXT: movdqa %xmm0, %xmm2 1016; SSE2-NEXT: pxor %xmm1, %xmm2 1017; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 1018; SSE2-NEXT: pxor %xmm4, %xmm4 1019; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 1020; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] 1021; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 1022; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 1023; SSE2-NEXT: pand %xmm4, %xmm2 1024; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1025; SSE2-NEXT: por %xmm2, %xmm3 1026; SSE2-NEXT: pand %xmm3, %xmm0 1027; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 1028; SSE2-NEXT: por %xmm0, %xmm3 1029; SSE2-NEXT: movdqa %xmm3, %xmm0 1030; SSE2-NEXT: pxor %xmm1, %xmm0 1031; SSE2-NEXT: movdqa %xmm0, %xmm2 1032; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 1033; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] 1034; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 1035; SSE2-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[1,1,3,3] 1036; SSE2-NEXT: pand %xmm4, %xmm0 1037; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 1038; SSE2-NEXT: por %xmm0, %xmm1 1039; SSE2-NEXT: pand %xmm3, %xmm1 1040; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 1041; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1042; SSE2-NEXT: retq 1043; 1044; SSSE3-LABEL: trunc_packus_v2i64_v2i16: 1045; SSSE3: # %bb.0: 1046; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 1047; SSSE3-NEXT: movdqa %xmm0, %xmm2 1048; SSSE3-NEXT: pxor %xmm1, %xmm2 1049; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 1050; SSSE3-NEXT: pxor %xmm4, %xmm4 1051; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 1052; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] 1053; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 1054; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 1055; SSSE3-NEXT: pand %xmm4, %xmm2 1056; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1057; SSSE3-NEXT: por %xmm2, %xmm3 1058; SSSE3-NEXT: pand %xmm3, %xmm0 1059; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 1060; SSSE3-NEXT: por %xmm0, %xmm3 1061; SSSE3-NEXT: movdqa %xmm3, %xmm0 1062; SSSE3-NEXT: pxor %xmm1, %xmm0 1063; SSSE3-NEXT: movdqa %xmm0, %xmm2 1064; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 1065; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] 1066; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 1067; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1068; SSSE3-NEXT: pand %xmm4, %xmm0 1069; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 1070; SSSE3-NEXT: por %xmm0, %xmm1 1071; SSSE3-NEXT: pand %xmm3, %xmm1 1072; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 1073; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1074; SSSE3-NEXT: retq 1075; 1076; SSE41-LABEL: trunc_packus_v2i64_v2i16: 1077; SSE41: # %bb.0: 1078; SSE41-NEXT: movdqa %xmm0, %xmm1 1079; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] 1080; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] 1081; SSE41-NEXT: pxor %xmm3, %xmm0 1082; SSE41-NEXT: movdqa {{.*#+}} xmm4 = 
[2147549183,2147549183] 1083; SSE41-NEXT: movdqa %xmm4, %xmm5 1084; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 1085; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 1086; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 1087; SSE41-NEXT: pand %xmm5, %xmm0 1088; SSE41-NEXT: por %xmm4, %xmm0 1089; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 1090; SSE41-NEXT: xorpd %xmm1, %xmm1 1091; SSE41-NEXT: movapd %xmm2, %xmm4 1092; SSE41-NEXT: xorpd %xmm3, %xmm4 1093; SSE41-NEXT: movapd %xmm4, %xmm5 1094; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 1095; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 1096; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 1097; SSE41-NEXT: pand %xmm5, %xmm0 1098; SSE41-NEXT: por %xmm4, %xmm0 1099; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 1100; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 1101; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1102; SSE41-NEXT: retq 1103; 1104; AVX1-LABEL: trunc_packus_v2i64_v2i16: 1105; AVX1: # %bb.0: 1106; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] 1107; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1108; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1109; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1110; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 1111; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 1112; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1113; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1114; AVX1-NEXT: retq 1115; 1116; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16: 1117; AVX2-SLOW: # %bb.0: 1118; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] 1119; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1120; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1121; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1122; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 1123; AVX2-SLOW-NEXT: vpand %xmm0, %xmm1, %xmm0 1124; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1125; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1126; AVX2-SLOW-NEXT: retq 1127; 1128; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16: 1129; AVX2-FAST: # %bb.0: 1130; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] 1131; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1132; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1133; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 1134; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 1135; AVX2-FAST-NEXT: vpand %xmm0, %xmm1, %xmm0 1136; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] 1137; AVX2-FAST-NEXT: retq 1138; 1139; AVX512F-LABEL: trunc_packus_v2i64_v2i16: 1140; AVX512F: # %bb.0: 1141; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1142; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1143; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1144; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 1145; AVX512F-NEXT: vzeroupper 1146; AVX512F-NEXT: retq 1147; 1148; AVX512VL-LABEL: trunc_packus_v2i64_v2i16: 1149; AVX512VL: # %bb.0: 1150; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1151; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 1152; AVX512VL-NEXT: vpmovusqw %xmm0, %xmm0 1153; AVX512VL-NEXT: retq 1154; 1155; AVX512BW-LABEL: trunc_packus_v2i64_v2i16: 1156; AVX512BW: # %bb.0: 1157; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1158; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1159; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1160; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 1161; AVX512BW-NEXT: vzeroupper 1162; AVX512BW-NEXT: retq 1163; 1164; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i16: 1165; AVX512BWVL: # %bb.0: 1166; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1167; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 1168; AVX512BWVL-NEXT: vpmovusqw %xmm0, %xmm0 1169; AVX512BWVL-NEXT: retq 1170; 1171; SKX-LABEL: trunc_packus_v2i64_v2i16: 1172; SKX: # %bb.0: 1173; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1174; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 1175; SKX-NEXT: vpmovusqw %xmm0, %xmm0 1176; SKX-NEXT: retq 1177 %1 = icmp slt <2 x i64> %a0, <i64 65535, i64 65535> 1178 %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 65535, i64 65535> 1179 %3 = icmp sgt <2 x i64> %2, 
zeroinitializer 1180 %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer 1181 %5 = trunc <2 x i64> %4 to <2 x i16> 1182 ret <2 x i16> %5 1183} 1184 1185define void @trunc_packus_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) { 1186; SSE2-LABEL: trunc_packus_v2i64_v2i16_store: 1187; SSE2: # %bb.0: 1188; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 1189; SSE2-NEXT: movdqa %xmm0, %xmm2 1190; SSE2-NEXT: pxor %xmm1, %xmm2 1191; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 1192; SSE2-NEXT: pxor %xmm4, %xmm4 1193; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 1194; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] 1195; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 1196; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 1197; SSE2-NEXT: pand %xmm4, %xmm2 1198; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1199; SSE2-NEXT: por %xmm2, %xmm3 1200; SSE2-NEXT: pand %xmm3, %xmm0 1201; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 1202; SSE2-NEXT: por %xmm0, %xmm3 1203; SSE2-NEXT: movdqa %xmm3, %xmm0 1204; SSE2-NEXT: pxor %xmm1, %xmm0 1205; SSE2-NEXT: movdqa %xmm0, %xmm2 1206; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 1207; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] 1208; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 1209; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1210; SSE2-NEXT: pand %xmm4, %xmm0 1211; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 1212; SSE2-NEXT: por %xmm0, %xmm1 1213; SSE2-NEXT: pand %xmm3, %xmm1 1214; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 1215; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1216; SSE2-NEXT: movd %xmm0, (%rdi) 1217; SSE2-NEXT: retq 1218; 1219; SSSE3-LABEL: trunc_packus_v2i64_v2i16_store: 1220; SSSE3: # %bb.0: 1221; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 1222; SSSE3-NEXT: movdqa %xmm0, %xmm2 1223; SSSE3-NEXT: pxor %xmm1, %xmm2 1224; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 1225; SSSE3-NEXT: pxor %xmm4, %xmm4 1226; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4 1227; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = 
[2147549183,2147549183] 1228; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 1229; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] 1230; SSSE3-NEXT: pand %xmm4, %xmm2 1231; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1232; SSSE3-NEXT: por %xmm2, %xmm3 1233; SSSE3-NEXT: pand %xmm3, %xmm0 1234; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 1235; SSSE3-NEXT: por %xmm0, %xmm3 1236; SSSE3-NEXT: movdqa %xmm3, %xmm0 1237; SSSE3-NEXT: pxor %xmm1, %xmm0 1238; SSSE3-NEXT: movdqa %xmm0, %xmm2 1239; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 1240; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] 1241; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 1242; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1243; SSSE3-NEXT: pand %xmm4, %xmm0 1244; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 1245; SSSE3-NEXT: por %xmm0, %xmm1 1246; SSSE3-NEXT: pand %xmm3, %xmm1 1247; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 1248; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1249; SSSE3-NEXT: movd %xmm0, (%rdi) 1250; SSSE3-NEXT: retq 1251; 1252; SSE41-LABEL: trunc_packus_v2i64_v2i16_store: 1253; SSE41: # %bb.0: 1254; SSE41-NEXT: movdqa %xmm0, %xmm1 1255; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65535,65535] 1256; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] 1257; SSE41-NEXT: pxor %xmm3, %xmm0 1258; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] 1259; SSE41-NEXT: movdqa %xmm4, %xmm5 1260; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 1261; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 1262; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 1263; SSE41-NEXT: pand %xmm5, %xmm0 1264; SSE41-NEXT: por %xmm4, %xmm0 1265; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 1266; SSE41-NEXT: xorpd %xmm1, %xmm1 1267; SSE41-NEXT: movapd %xmm2, %xmm4 1268; SSE41-NEXT: xorpd %xmm3, %xmm4 1269; SSE41-NEXT: movapd %xmm4, %xmm5 1270; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 1271; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 1272; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 1273; SSE41-NEXT: pand %xmm5, %xmm0 1274; SSE41-NEXT: por %xmm4, %xmm0 
1275; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 1276; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 1277; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1278; SSE41-NEXT: movd %xmm0, (%rdi) 1279; SSE41-NEXT: retq 1280; 1281; AVX1-LABEL: trunc_packus_v2i64_v2i16_store: 1282; AVX1: # %bb.0: 1283; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] 1284; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1285; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1286; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1287; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 1288; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 1289; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1290; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1291; AVX1-NEXT: vmovd %xmm0, (%rdi) 1292; AVX1-NEXT: retq 1293; 1294; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16_store: 1295; AVX2-SLOW: # %bb.0: 1296; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] 1297; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1298; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1299; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1300; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 1301; AVX2-SLOW-NEXT: vpand %xmm0, %xmm1, %xmm0 1302; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1303; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1304; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi) 1305; AVX2-SLOW-NEXT: retq 1306; 1307; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16_store: 1308; AVX2-FAST: # %bb.0: 1309; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] 1310; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1311; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1312; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 1313; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 1314; AVX2-FAST-NEXT: vpand %xmm0, %xmm1, %xmm0 1315; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] 1316; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi) 1317; AVX2-FAST-NEXT: retq 1318; 1319; AVX512F-LABEL: trunc_packus_v2i64_v2i16_store: 1320; 
AVX512F: # %bb.0: 1321; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1322; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1323; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1324; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 1325; AVX512F-NEXT: vmovd %xmm0, (%rdi) 1326; AVX512F-NEXT: vzeroupper 1327; AVX512F-NEXT: retq 1328; 1329; AVX512VL-LABEL: trunc_packus_v2i64_v2i16_store: 1330; AVX512VL: # %bb.0: 1331; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1332; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 1333; AVX512VL-NEXT: vpmovusqw %xmm0, (%rdi) 1334; AVX512VL-NEXT: retq 1335; 1336; AVX512BW-LABEL: trunc_packus_v2i64_v2i16_store: 1337; AVX512BW: # %bb.0: 1338; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 1339; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1340; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1341; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 1342; AVX512BW-NEXT: vmovd %xmm0, (%rdi) 1343; AVX512BW-NEXT: vzeroupper 1344; AVX512BW-NEXT: retq 1345; 1346; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i16_store: 1347; AVX512BWVL: # %bb.0: 1348; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1349; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 1350; AVX512BWVL-NEXT: vpmovusqw %xmm0, (%rdi) 1351; AVX512BWVL-NEXT: retq 1352; 1353; SKX-LABEL: trunc_packus_v2i64_v2i16_store: 1354; SKX: # %bb.0: 1355; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1356; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 1357; SKX-NEXT: vpmovusqw %xmm0, (%rdi) 1358; SKX-NEXT: retq 1359 %1 = icmp slt <2 x i64> %a0, <i64 65535, i64 65535> 1360 %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 65535, i64 65535> 1361 %3 = icmp sgt <2 x i64> %2, zeroinitializer 1362 %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer 1363 %5 = trunc <2 x i64> %4 to <2 x i16> 1364 store <2 x i16> %5, ptr%p1 1365 ret void 1366} 1367 1368define <4 x i16> @trunc_packus_v4i64_v4i16(<4 x i64> %a0) { 1369; SSE2-LABEL: trunc_packus_v4i64_v4i16: 1370; SSE2: # %bb.0: 1371; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] 1372; SSE2-NEXT: movdqa 
{{.*#+}} xmm2 = [2147483648,2147483648] 1373; SSE2-NEXT: movdqa %xmm1, %xmm3 1374; SSE2-NEXT: pxor %xmm2, %xmm3 1375; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 1376; SSE2-NEXT: pxor %xmm9, %xmm9 1377; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 1378; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] 1379; SSE2-NEXT: movdqa %xmm4, %xmm7 1380; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 1381; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] 1382; SSE2-NEXT: pand %xmm5, %xmm6 1383; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] 1384; SSE2-NEXT: por %xmm6, %xmm3 1385; SSE2-NEXT: pand %xmm3, %xmm1 1386; SSE2-NEXT: pandn %xmm8, %xmm3 1387; SSE2-NEXT: por %xmm1, %xmm3 1388; SSE2-NEXT: movdqa %xmm0, %xmm1 1389; SSE2-NEXT: pxor %xmm2, %xmm1 1390; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 1391; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 1392; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 1393; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] 1394; SSE2-NEXT: pand %xmm5, %xmm1 1395; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1396; SSE2-NEXT: por %xmm1, %xmm4 1397; SSE2-NEXT: pand %xmm4, %xmm0 1398; SSE2-NEXT: pandn %xmm8, %xmm4 1399; SSE2-NEXT: por %xmm0, %xmm4 1400; SSE2-NEXT: movdqa %xmm4, %xmm0 1401; SSE2-NEXT: pxor %xmm2, %xmm0 1402; SSE2-NEXT: movdqa %xmm0, %xmm1 1403; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 1404; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] 1405; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 1406; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1407; SSE2-NEXT: pand %xmm5, %xmm0 1408; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1409; SSE2-NEXT: por %xmm0, %xmm1 1410; SSE2-NEXT: pand %xmm4, %xmm1 1411; SSE2-NEXT: movdqa %xmm3, %xmm0 1412; SSE2-NEXT: pxor %xmm2, %xmm0 1413; SSE2-NEXT: movdqa %xmm0, %xmm4 1414; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 1415; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 1416; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 1417; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1418; SSE2-NEXT: pand %xmm5, %xmm0 1419; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 1420; SSE2-NEXT: por 
%xmm0, %xmm2 1421; SSE2-NEXT: pand %xmm3, %xmm2 1422; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1423; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] 1424; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 1425; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1426; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1427; SSE2-NEXT: retq 1428; 1429; SSSE3-LABEL: trunc_packus_v4i64_v4i16: 1430; SSSE3: # %bb.0: 1431; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] 1432; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 1433; SSSE3-NEXT: movdqa %xmm1, %xmm3 1434; SSSE3-NEXT: pxor %xmm2, %xmm3 1435; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 1436; SSSE3-NEXT: pxor %xmm9, %xmm9 1437; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 1438; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] 1439; SSSE3-NEXT: movdqa %xmm4, %xmm7 1440; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 1441; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] 1442; SSSE3-NEXT: pand %xmm5, %xmm6 1443; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] 1444; SSSE3-NEXT: por %xmm6, %xmm3 1445; SSSE3-NEXT: pand %xmm3, %xmm1 1446; SSSE3-NEXT: pandn %xmm8, %xmm3 1447; SSSE3-NEXT: por %xmm1, %xmm3 1448; SSSE3-NEXT: movdqa %xmm0, %xmm1 1449; SSSE3-NEXT: pxor %xmm2, %xmm1 1450; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 1451; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 1452; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 1453; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] 1454; SSSE3-NEXT: pand %xmm5, %xmm1 1455; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1456; SSSE3-NEXT: por %xmm1, %xmm4 1457; SSSE3-NEXT: pand %xmm4, %xmm0 1458; SSSE3-NEXT: pandn %xmm8, %xmm4 1459; SSSE3-NEXT: por %xmm0, %xmm4 1460; SSSE3-NEXT: movdqa %xmm4, %xmm0 1461; SSSE3-NEXT: pxor %xmm2, %xmm0 1462; SSSE3-NEXT: movdqa %xmm0, %xmm1 1463; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 1464; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] 1465; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 1466; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = 
xmm0[1,1,3,3] 1467; SSSE3-NEXT: pand %xmm5, %xmm0 1468; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1469; SSSE3-NEXT: por %xmm0, %xmm1 1470; SSSE3-NEXT: pand %xmm4, %xmm1 1471; SSSE3-NEXT: movdqa %xmm3, %xmm0 1472; SSSE3-NEXT: pxor %xmm2, %xmm0 1473; SSSE3-NEXT: movdqa %xmm0, %xmm4 1474; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 1475; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 1476; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 1477; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1478; SSSE3-NEXT: pand %xmm5, %xmm0 1479; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 1480; SSSE3-NEXT: por %xmm0, %xmm2 1481; SSSE3-NEXT: pand %xmm3, %xmm2 1482; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1483; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] 1484; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] 1485; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1486; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1487; SSSE3-NEXT: retq 1488; 1489; SSE41-LABEL: trunc_packus_v4i64_v4i16: 1490; SSE41: # %bb.0: 1491; SSE41-NEXT: movdqa %xmm0, %xmm2 1492; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] 1493; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] 1494; SSE41-NEXT: pxor %xmm3, %xmm0 1495; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183] 1496; SSE41-NEXT: movdqa %xmm6, %xmm5 1497; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 1498; SSE41-NEXT: movdqa %xmm6, %xmm7 1499; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 1500; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 1501; SSE41-NEXT: pand %xmm5, %xmm0 1502; SSE41-NEXT: por %xmm7, %xmm0 1503; SSE41-NEXT: movapd %xmm4, %xmm5 1504; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 1505; SSE41-NEXT: movdqa %xmm1, %xmm0 1506; SSE41-NEXT: pxor %xmm3, %xmm0 1507; SSE41-NEXT: movdqa %xmm6, %xmm2 1508; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 1509; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 1510; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 1511; SSE41-NEXT: pand %xmm2, %xmm0 1512; SSE41-NEXT: por %xmm6, %xmm0 1513; 
SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 1514; SSE41-NEXT: xorpd %xmm1, %xmm1 1515; SSE41-NEXT: movapd %xmm4, %xmm2 1516; SSE41-NEXT: xorpd %xmm3, %xmm2 1517; SSE41-NEXT: movapd %xmm2, %xmm6 1518; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 1519; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 1520; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 1521; SSE41-NEXT: pand %xmm6, %xmm0 1522; SSE41-NEXT: por %xmm2, %xmm0 1523; SSE41-NEXT: pxor %xmm2, %xmm2 1524; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 1525; SSE41-NEXT: movapd %xmm5, %xmm4 1526; SSE41-NEXT: xorpd %xmm3, %xmm4 1527; SSE41-NEXT: movapd %xmm4, %xmm6 1528; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 1529; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 1530; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 1531; SSE41-NEXT: pand %xmm6, %xmm0 1532; SSE41-NEXT: por %xmm4, %xmm0 1533; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 1534; SSE41-NEXT: packusdw %xmm2, %xmm1 1535; SSE41-NEXT: packusdw %xmm1, %xmm1 1536; SSE41-NEXT: movdqa %xmm1, %xmm0 1537; SSE41-NEXT: retq 1538; 1539; AVX1-LABEL: trunc_packus_v4i64_v4i16: 1540; AVX1: # %bb.0: 1541; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] 1542; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1543; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 1544; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1545; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 1546; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 1547; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1548; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 1549; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 1550; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 1551; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1552; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 1553; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 1554; AVX1-NEXT: vzeroupper 1555; AVX1-NEXT: retq 1556; 1557; AVX2-LABEL: trunc_packus_v4i64_v4i16: 1558; AVX2: # %bb.0: 1559; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] 1560; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 1561; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 1562; AVX2-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 1563; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 1564; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 1565; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1566; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1567; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 1568; AVX2-NEXT: vzeroupper 1569; AVX2-NEXT: retq 1570; 1571; AVX512F-LABEL: trunc_packus_v4i64_v4i16: 1572; AVX512F: # %bb.0: 1573; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1574; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1575; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1576; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 1577; AVX512F-NEXT: vzeroupper 1578; AVX512F-NEXT: retq 1579; 1580; AVX512VL-LABEL: trunc_packus_v4i64_v4i16: 1581; AVX512VL: # %bb.0: 1582; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1583; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1584; AVX512VL-NEXT: vpmovusqw %ymm0, %xmm0 1585; AVX512VL-NEXT: vzeroupper 1586; AVX512VL-NEXT: retq 1587; 1588; AVX512BW-LABEL: trunc_packus_v4i64_v4i16: 1589; AVX512BW: # %bb.0: 1590; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1591; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1592; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1593; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 1594; AVX512BW-NEXT: vzeroupper 1595; AVX512BW-NEXT: retq 1596; 1597; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i16: 1598; AVX512BWVL: # %bb.0: 1599; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1600; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1601; AVX512BWVL-NEXT: vpmovusqw %ymm0, %xmm0 1602; AVX512BWVL-NEXT: vzeroupper 1603; AVX512BWVL-NEXT: retq 1604; 1605; SKX-LABEL: trunc_packus_v4i64_v4i16: 1606; SKX: # %bb.0: 1607; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1608; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1609; SKX-NEXT: vpmovusqw %ymm0, %xmm0 1610; SKX-NEXT: vzeroupper 1611; SKX-NEXT: retq 1612 %1 = icmp slt <4 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535> 1613 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 65535, i64 65535, i64 65535, i64 65535> 1614 %3 = icmp sgt <4 x i64> %2, 
zeroinitializer 1615 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer 1616 %5 = trunc <4 x i64> %4 to <4 x i16> 1617 ret <4 x i16> %5 1618} 1619 1620define void @trunc_packus_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) { 1621; SSE2-LABEL: trunc_packus_v4i64_v4i16_store: 1622; SSE2: # %bb.0: 1623; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] 1624; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 1625; SSE2-NEXT: movdqa %xmm1, %xmm3 1626; SSE2-NEXT: pxor %xmm2, %xmm3 1627; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 1628; SSE2-NEXT: pxor %xmm9, %xmm9 1629; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 1630; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] 1631; SSE2-NEXT: movdqa %xmm4, %xmm7 1632; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 1633; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] 1634; SSE2-NEXT: pand %xmm5, %xmm6 1635; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] 1636; SSE2-NEXT: por %xmm6, %xmm3 1637; SSE2-NEXT: pand %xmm3, %xmm1 1638; SSE2-NEXT: pandn %xmm8, %xmm3 1639; SSE2-NEXT: por %xmm1, %xmm3 1640; SSE2-NEXT: movdqa %xmm0, %xmm1 1641; SSE2-NEXT: pxor %xmm2, %xmm1 1642; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 1643; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 1644; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 1645; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] 1646; SSE2-NEXT: pand %xmm5, %xmm1 1647; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1648; SSE2-NEXT: por %xmm1, %xmm4 1649; SSE2-NEXT: pand %xmm4, %xmm0 1650; SSE2-NEXT: pandn %xmm8, %xmm4 1651; SSE2-NEXT: por %xmm0, %xmm4 1652; SSE2-NEXT: movdqa %xmm4, %xmm0 1653; SSE2-NEXT: pxor %xmm2, %xmm0 1654; SSE2-NEXT: movdqa %xmm0, %xmm1 1655; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 1656; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] 1657; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 1658; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1659; SSE2-NEXT: pand %xmm5, %xmm0 1660; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1661; SSE2-NEXT: por %xmm0, %xmm1 1662; SSE2-NEXT: pand %xmm4, %xmm1 1663; SSE2-NEXT: 
movdqa %xmm3, %xmm0 1664; SSE2-NEXT: pxor %xmm2, %xmm0 1665; SSE2-NEXT: movdqa %xmm0, %xmm4 1666; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 1667; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 1668; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 1669; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1670; SSE2-NEXT: pand %xmm5, %xmm0 1671; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 1672; SSE2-NEXT: por %xmm0, %xmm2 1673; SSE2-NEXT: pand %xmm3, %xmm2 1674; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1675; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1676; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1677; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1678; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1679; SSE2-NEXT: movq %xmm1, (%rdi) 1680; SSE2-NEXT: retq 1681; 1682; SSSE3-LABEL: trunc_packus_v4i64_v4i16_store: 1683; SSSE3: # %bb.0: 1684; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] 1685; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 1686; SSSE3-NEXT: movdqa %xmm1, %xmm3 1687; SSSE3-NEXT: pxor %xmm2, %xmm3 1688; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 1689; SSSE3-NEXT: pxor %xmm9, %xmm9 1690; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 1691; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] 1692; SSSE3-NEXT: movdqa %xmm4, %xmm7 1693; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 1694; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] 1695; SSSE3-NEXT: pand %xmm5, %xmm6 1696; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] 1697; SSSE3-NEXT: por %xmm6, %xmm3 1698; SSSE3-NEXT: pand %xmm3, %xmm1 1699; SSSE3-NEXT: pandn %xmm8, %xmm3 1700; SSSE3-NEXT: por %xmm1, %xmm3 1701; SSSE3-NEXT: movdqa %xmm0, %xmm1 1702; SSSE3-NEXT: pxor %xmm2, %xmm1 1703; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 1704; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 1705; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 1706; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] 1707; SSSE3-NEXT: pand %xmm5, %xmm1 1708; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1709; 
SSSE3-NEXT: por %xmm1, %xmm4 1710; SSSE3-NEXT: pand %xmm4, %xmm0 1711; SSSE3-NEXT: pandn %xmm8, %xmm4 1712; SSSE3-NEXT: por %xmm0, %xmm4 1713; SSSE3-NEXT: movdqa %xmm4, %xmm0 1714; SSSE3-NEXT: pxor %xmm2, %xmm0 1715; SSSE3-NEXT: movdqa %xmm0, %xmm1 1716; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 1717; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] 1718; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 1719; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1720; SSSE3-NEXT: pand %xmm5, %xmm0 1721; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 1722; SSSE3-NEXT: por %xmm0, %xmm1 1723; SSSE3-NEXT: pand %xmm4, %xmm1 1724; SSSE3-NEXT: movdqa %xmm3, %xmm0 1725; SSSE3-NEXT: pxor %xmm2, %xmm0 1726; SSSE3-NEXT: movdqa %xmm0, %xmm4 1727; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 1728; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 1729; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 1730; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1731; SSSE3-NEXT: pand %xmm5, %xmm0 1732; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 1733; SSSE3-NEXT: por %xmm0, %xmm2 1734; SSSE3-NEXT: pand %xmm3, %xmm2 1735; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1736; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1737; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1738; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1739; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1740; SSSE3-NEXT: movq %xmm1, (%rdi) 1741; SSSE3-NEXT: retq 1742; 1743; SSE41-LABEL: trunc_packus_v4i64_v4i16_store: 1744; SSE41: # %bb.0: 1745; SSE41-NEXT: movdqa %xmm0, %xmm2 1746; SSE41-NEXT: movapd {{.*#+}} xmm4 = [65535,65535] 1747; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] 1748; SSE41-NEXT: pxor %xmm3, %xmm0 1749; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147549183,2147549183] 1750; SSE41-NEXT: movdqa %xmm6, %xmm5 1751; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 1752; SSE41-NEXT: movdqa %xmm6, %xmm7 1753; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 1754; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 1755; SSE41-NEXT: 
pand %xmm5, %xmm0 1756; SSE41-NEXT: por %xmm7, %xmm0 1757; SSE41-NEXT: movapd %xmm4, %xmm5 1758; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 1759; SSE41-NEXT: movdqa %xmm1, %xmm0 1760; SSE41-NEXT: pxor %xmm3, %xmm0 1761; SSE41-NEXT: movdqa %xmm6, %xmm2 1762; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 1763; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 1764; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 1765; SSE41-NEXT: pand %xmm2, %xmm0 1766; SSE41-NEXT: por %xmm6, %xmm0 1767; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 1768; SSE41-NEXT: xorpd %xmm1, %xmm1 1769; SSE41-NEXT: movapd %xmm4, %xmm2 1770; SSE41-NEXT: xorpd %xmm3, %xmm2 1771; SSE41-NEXT: movapd %xmm2, %xmm6 1772; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 1773; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 1774; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 1775; SSE41-NEXT: pand %xmm6, %xmm0 1776; SSE41-NEXT: por %xmm2, %xmm0 1777; SSE41-NEXT: pxor %xmm2, %xmm2 1778; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 1779; SSE41-NEXT: movapd %xmm5, %xmm4 1780; SSE41-NEXT: xorpd %xmm3, %xmm4 1781; SSE41-NEXT: movapd %xmm4, %xmm6 1782; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 1783; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 1784; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 1785; SSE41-NEXT: pand %xmm6, %xmm0 1786; SSE41-NEXT: por %xmm4, %xmm0 1787; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 1788; SSE41-NEXT: packusdw %xmm2, %xmm1 1789; SSE41-NEXT: packusdw %xmm1, %xmm1 1790; SSE41-NEXT: movq %xmm1, (%rdi) 1791; SSE41-NEXT: retq 1792; 1793; AVX1-LABEL: trunc_packus_v4i64_v4i16_store: 1794; AVX1: # %bb.0: 1795; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] 1796; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 1797; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm2 1798; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1799; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3 1800; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 1801; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1802; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3 1803; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 1804; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 
1805; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 1806; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 1807; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 1808; AVX1-NEXT: vmovq %xmm0, (%rdi) 1809; AVX1-NEXT: vzeroupper 1810; AVX1-NEXT: retq 1811; 1812; AVX2-LABEL: trunc_packus_v4i64_v4i16_store: 1813; AVX2: # %bb.0: 1814; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] 1815; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 1816; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 1817; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1818; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 1819; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 1820; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1821; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1822; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 1823; AVX2-NEXT: vmovq %xmm0, (%rdi) 1824; AVX2-NEXT: vzeroupper 1825; AVX2-NEXT: retq 1826; 1827; AVX512F-LABEL: trunc_packus_v4i64_v4i16_store: 1828; AVX512F: # %bb.0: 1829; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1830; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 1831; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1832; AVX512F-NEXT: vpmovusqw %zmm0, %xmm0 1833; AVX512F-NEXT: vmovq %xmm0, (%rdi) 1834; AVX512F-NEXT: vzeroupper 1835; AVX512F-NEXT: retq 1836; 1837; AVX512VL-LABEL: trunc_packus_v4i64_v4i16_store: 1838; AVX512VL: # %bb.0: 1839; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 1840; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1841; AVX512VL-NEXT: vpmovusqw %ymm0, (%rdi) 1842; AVX512VL-NEXT: vzeroupper 1843; AVX512VL-NEXT: retq 1844; 1845; AVX512BW-LABEL: trunc_packus_v4i64_v4i16_store: 1846; AVX512BW: # %bb.0: 1847; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1848; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 1849; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 1850; AVX512BW-NEXT: vpmovusqw %zmm0, %xmm0 1851; AVX512BW-NEXT: vmovq %xmm0, (%rdi) 1852; AVX512BW-NEXT: vzeroupper 1853; AVX512BW-NEXT: retq 1854; 1855; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i16_store: 1856; AVX512BWVL: # %bb.0: 1857; AVX512BWVL-NEXT: vpxor 
%xmm1, %xmm1, %xmm1 1858; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1859; AVX512BWVL-NEXT: vpmovusqw %ymm0, (%rdi) 1860; AVX512BWVL-NEXT: vzeroupper 1861; AVX512BWVL-NEXT: retq 1862; 1863; SKX-LABEL: trunc_packus_v4i64_v4i16_store: 1864; SKX: # %bb.0: 1865; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 1866; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 1867; SKX-NEXT: vpmovusqw %ymm0, (%rdi) 1868; SKX-NEXT: vzeroupper 1869; SKX-NEXT: retq 1870 %1 = icmp slt <4 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535> 1871 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 65535, i64 65535, i64 65535, i64 65535> 1872 %3 = icmp sgt <4 x i64> %2, zeroinitializer 1873 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer 1874 %5 = trunc <4 x i64> %4 to <4 x i16> 1875 store <4 x i16> %5, ptr%p1 1876 ret void 1877} 1878 1879define <8 x i16> @trunc_packus_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256" { 1880; SSE2-LABEL: trunc_packus_v8i64_v8i16: 1881; SSE2: # %bb.0: 1882; SSE2-NEXT: movdqa (%rdi), %xmm4 1883; SSE2-NEXT: movdqa 16(%rdi), %xmm2 1884; SSE2-NEXT: movdqa 32(%rdi), %xmm10 1885; SSE2-NEXT: movdqa 48(%rdi), %xmm6 1886; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] 1887; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] 1888; SSE2-NEXT: movdqa %xmm2, %xmm1 1889; SSE2-NEXT: pxor %xmm11, %xmm1 1890; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 1891; SSE2-NEXT: pxor %xmm9, %xmm9 1892; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 1893; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] 1894; SSE2-NEXT: movdqa %xmm3, %xmm5 1895; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 1896; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 1897; SSE2-NEXT: pand %xmm7, %xmm0 1898; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] 1899; SSE2-NEXT: por %xmm0, %xmm1 1900; SSE2-NEXT: pand %xmm1, %xmm2 1901; SSE2-NEXT: pandn %xmm8, %xmm1 1902; SSE2-NEXT: por %xmm2, %xmm1 1903; SSE2-NEXT: movdqa %xmm4, %xmm0 1904; SSE2-NEXT: pxor %xmm11, %xmm0 1905; SSE2-NEXT: pshufd {{.*#+}} xmm2 = 
xmm0[1,1,3,3] 1906; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 1907; SSE2-NEXT: movdqa %xmm3, %xmm5 1908; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 1909; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 1910; SSE2-NEXT: pand %xmm2, %xmm0 1911; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] 1912; SSE2-NEXT: por %xmm0, %xmm2 1913; SSE2-NEXT: pand %xmm2, %xmm4 1914; SSE2-NEXT: pandn %xmm8, %xmm2 1915; SSE2-NEXT: por %xmm4, %xmm2 1916; SSE2-NEXT: movdqa %xmm6, %xmm0 1917; SSE2-NEXT: pxor %xmm11, %xmm0 1918; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 1919; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 1920; SSE2-NEXT: movdqa %xmm3, %xmm5 1921; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 1922; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 1923; SSE2-NEXT: pand %xmm4, %xmm0 1924; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] 1925; SSE2-NEXT: por %xmm0, %xmm4 1926; SSE2-NEXT: pand %xmm4, %xmm6 1927; SSE2-NEXT: pandn %xmm8, %xmm4 1928; SSE2-NEXT: por %xmm6, %xmm4 1929; SSE2-NEXT: movdqa %xmm10, %xmm0 1930; SSE2-NEXT: pxor %xmm11, %xmm0 1931; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 1932; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 1933; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 1934; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 1935; SSE2-NEXT: pand %xmm5, %xmm0 1936; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 1937; SSE2-NEXT: por %xmm0, %xmm5 1938; SSE2-NEXT: pand %xmm5, %xmm10 1939; SSE2-NEXT: pandn %xmm8, %xmm5 1940; SSE2-NEXT: por %xmm10, %xmm5 1941; SSE2-NEXT: movdqa %xmm5, %xmm0 1942; SSE2-NEXT: pxor %xmm11, %xmm0 1943; SSE2-NEXT: movdqa %xmm0, %xmm3 1944; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 1945; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] 1946; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 1947; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1948; SSE2-NEXT: pand %xmm6, %xmm0 1949; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1950; SSE2-NEXT: por %xmm0, %xmm3 1951; SSE2-NEXT: pand %xmm5, %xmm3 1952; SSE2-NEXT: movdqa %xmm4, %xmm0 1953; SSE2-NEXT: pxor %xmm11, %xmm0 1954; SSE2-NEXT: movdqa %xmm0, %xmm5 1955; SSE2-NEXT: 
pcmpgtd %xmm11, %xmm5 1956; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 1957; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 1958; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1959; SSE2-NEXT: pand %xmm6, %xmm0 1960; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1961; SSE2-NEXT: por %xmm0, %xmm5 1962; SSE2-NEXT: pand %xmm4, %xmm5 1963; SSE2-NEXT: movdqa %xmm2, %xmm0 1964; SSE2-NEXT: pxor %xmm11, %xmm0 1965; SSE2-NEXT: movdqa %xmm0, %xmm4 1966; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 1967; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] 1968; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 1969; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1970; SSE2-NEXT: pand %xmm6, %xmm0 1971; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1972; SSE2-NEXT: por %xmm0, %xmm4 1973; SSE2-NEXT: pand %xmm2, %xmm4 1974; SSE2-NEXT: movdqa %xmm1, %xmm0 1975; SSE2-NEXT: pxor %xmm11, %xmm0 1976; SSE2-NEXT: movdqa %xmm0, %xmm2 1977; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 1978; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] 1979; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 1980; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 1981; SSE2-NEXT: pand %xmm6, %xmm0 1982; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1983; SSE2-NEXT: por %xmm0, %xmm2 1984; SSE2-NEXT: pand %xmm1, %xmm2 1985; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1986; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 1987; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] 1988; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1989; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1990; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] 1991; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] 1992; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 1993; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1994; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1995; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 1996; SSE2-NEXT: retq 1997; 1998; SSSE3-LABEL: trunc_packus_v8i64_v8i16: 1999; SSSE3: # 
%bb.0: 2000; SSSE3-NEXT: movdqa (%rdi), %xmm4 2001; SSSE3-NEXT: movdqa 16(%rdi), %xmm2 2002; SSSE3-NEXT: movdqa 32(%rdi), %xmm10 2003; SSSE3-NEXT: movdqa 48(%rdi), %xmm6 2004; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] 2005; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] 2006; SSSE3-NEXT: movdqa %xmm2, %xmm1 2007; SSSE3-NEXT: pxor %xmm11, %xmm1 2008; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 2009; SSSE3-NEXT: pxor %xmm9, %xmm9 2010; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 2011; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147549183,2147549183] 2012; SSSE3-NEXT: movdqa %xmm3, %xmm5 2013; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 2014; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 2015; SSSE3-NEXT: pand %xmm7, %xmm0 2016; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] 2017; SSSE3-NEXT: por %xmm0, %xmm1 2018; SSSE3-NEXT: pand %xmm1, %xmm2 2019; SSSE3-NEXT: pandn %xmm8, %xmm1 2020; SSSE3-NEXT: por %xmm2, %xmm1 2021; SSSE3-NEXT: movdqa %xmm4, %xmm0 2022; SSSE3-NEXT: pxor %xmm11, %xmm0 2023; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 2024; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 2025; SSSE3-NEXT: movdqa %xmm3, %xmm5 2026; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 2027; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 2028; SSSE3-NEXT: pand %xmm2, %xmm0 2029; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] 2030; SSSE3-NEXT: por %xmm0, %xmm2 2031; SSSE3-NEXT: pand %xmm2, %xmm4 2032; SSSE3-NEXT: pandn %xmm8, %xmm2 2033; SSSE3-NEXT: por %xmm4, %xmm2 2034; SSSE3-NEXT: movdqa %xmm6, %xmm0 2035; SSSE3-NEXT: pxor %xmm11, %xmm0 2036; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 2037; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 2038; SSSE3-NEXT: movdqa %xmm3, %xmm5 2039; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 2040; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 2041; SSSE3-NEXT: pand %xmm4, %xmm0 2042; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] 2043; SSSE3-NEXT: por %xmm0, %xmm4 2044; SSSE3-NEXT: pand %xmm4, %xmm6 2045; SSSE3-NEXT: pandn %xmm8, %xmm4 2046; SSSE3-NEXT: por %xmm6, %xmm4 
2047; SSSE3-NEXT: movdqa %xmm10, %xmm0 2048; SSSE3-NEXT: pxor %xmm11, %xmm0 2049; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 2050; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 2051; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 2052; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 2053; SSSE3-NEXT: pand %xmm5, %xmm0 2054; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 2055; SSSE3-NEXT: por %xmm0, %xmm5 2056; SSSE3-NEXT: pand %xmm5, %xmm10 2057; SSSE3-NEXT: pandn %xmm8, %xmm5 2058; SSSE3-NEXT: por %xmm10, %xmm5 2059; SSSE3-NEXT: movdqa %xmm5, %xmm0 2060; SSSE3-NEXT: pxor %xmm11, %xmm0 2061; SSSE3-NEXT: movdqa %xmm0, %xmm3 2062; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 2063; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] 2064; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 2065; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2066; SSSE3-NEXT: pand %xmm6, %xmm0 2067; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2068; SSSE3-NEXT: por %xmm0, %xmm3 2069; SSSE3-NEXT: pand %xmm5, %xmm3 2070; SSSE3-NEXT: movdqa %xmm4, %xmm0 2071; SSSE3-NEXT: pxor %xmm11, %xmm0 2072; SSSE3-NEXT: movdqa %xmm0, %xmm5 2073; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 2074; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 2075; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 2076; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2077; SSSE3-NEXT: pand %xmm6, %xmm0 2078; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 2079; SSSE3-NEXT: por %xmm0, %xmm5 2080; SSSE3-NEXT: pand %xmm4, %xmm5 2081; SSSE3-NEXT: movdqa %xmm2, %xmm0 2082; SSSE3-NEXT: pxor %xmm11, %xmm0 2083; SSSE3-NEXT: movdqa %xmm0, %xmm4 2084; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 2085; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] 2086; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 2087; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2088; SSSE3-NEXT: pand %xmm6, %xmm0 2089; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 2090; SSSE3-NEXT: por %xmm0, %xmm4 2091; SSSE3-NEXT: pand %xmm2, %xmm4 2092; SSSE3-NEXT: movdqa %xmm1, %xmm0 2093; SSSE3-NEXT: pxor %xmm11, %xmm0 2094; SSSE3-NEXT: movdqa 
%xmm0, %xmm2 2095; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 2096; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] 2097; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 2098; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 2099; SSSE3-NEXT: pand %xmm6, %xmm0 2100; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 2101; SSSE3-NEXT: por %xmm0, %xmm2 2102; SSSE3-NEXT: pand %xmm1, %xmm2 2103; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2104; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] 2105; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] 2106; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2107; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2108; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] 2109; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] 2110; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2111; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2112; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 2113; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 2114; SSSE3-NEXT: retq 2115; 2116; SSE41-LABEL: trunc_packus_v8i64_v8i16: 2117; SSE41: # %bb.0: 2118; SSE41-NEXT: movdqa (%rdi), %xmm10 2119; SSE41-NEXT: movdqa 16(%rdi), %xmm9 2120; SSE41-NEXT: movdqa 32(%rdi), %xmm3 2121; SSE41-NEXT: movdqa 48(%rdi), %xmm5 2122; SSE41-NEXT: movapd {{.*#+}} xmm1 = [65535,65535] 2123; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 2124; SSE41-NEXT: movdqa %xmm3, %xmm0 2125; SSE41-NEXT: pxor %xmm2, %xmm0 2126; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] 2127; SSE41-NEXT: movdqa %xmm4, %xmm7 2128; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 2129; SSE41-NEXT: movdqa %xmm4, %xmm6 2130; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 2131; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 2132; SSE41-NEXT: pand %xmm7, %xmm0 2133; SSE41-NEXT: por %xmm6, %xmm0 2134; SSE41-NEXT: movapd %xmm1, %xmm8 2135; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8 2136; SSE41-NEXT: movdqa %xmm5, %xmm0 2137; SSE41-NEXT: pxor %xmm2, 
%xmm0 2138; SSE41-NEXT: movdqa %xmm4, %xmm3 2139; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 2140; SSE41-NEXT: movdqa %xmm4, %xmm6 2141; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 2142; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 2143; SSE41-NEXT: pand %xmm3, %xmm0 2144; SSE41-NEXT: por %xmm6, %xmm0 2145; SSE41-NEXT: movapd %xmm1, %xmm6 2146; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 2147; SSE41-NEXT: movdqa %xmm10, %xmm0 2148; SSE41-NEXT: pxor %xmm2, %xmm0 2149; SSE41-NEXT: movdqa %xmm4, %xmm3 2150; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 2151; SSE41-NEXT: movdqa %xmm4, %xmm5 2152; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 2153; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 2154; SSE41-NEXT: pand %xmm3, %xmm0 2155; SSE41-NEXT: por %xmm5, %xmm0 2156; SSE41-NEXT: movapd %xmm1, %xmm3 2157; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 2158; SSE41-NEXT: movdqa %xmm9, %xmm0 2159; SSE41-NEXT: pxor %xmm2, %xmm0 2160; SSE41-NEXT: movdqa %xmm4, %xmm5 2161; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 2162; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 2163; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 2164; SSE41-NEXT: pand %xmm5, %xmm0 2165; SSE41-NEXT: por %xmm4, %xmm0 2166; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 2167; SSE41-NEXT: pxor %xmm5, %xmm5 2168; SSE41-NEXT: movapd %xmm1, %xmm4 2169; SSE41-NEXT: xorpd %xmm2, %xmm4 2170; SSE41-NEXT: movapd %xmm4, %xmm7 2171; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 2172; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 2173; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 2174; SSE41-NEXT: pand %xmm7, %xmm0 2175; SSE41-NEXT: por %xmm4, %xmm0 2176; SSE41-NEXT: pxor %xmm4, %xmm4 2177; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 2178; SSE41-NEXT: movapd %xmm3, %xmm1 2179; SSE41-NEXT: xorpd %xmm2, %xmm1 2180; SSE41-NEXT: movapd %xmm1, %xmm7 2181; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 2182; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 2183; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] 2184; SSE41-NEXT: pand %xmm7, %xmm0 2185; SSE41-NEXT: por %xmm1, %xmm0 2186; SSE41-NEXT: pxor %xmm1, %xmm1 2187; SSE41-NEXT: blendvpd %xmm0, 
%xmm3, %xmm1 2188; SSE41-NEXT: packusdw %xmm4, %xmm1 2189; SSE41-NEXT: movapd %xmm6, %xmm3 2190; SSE41-NEXT: xorpd %xmm2, %xmm3 2191; SSE41-NEXT: movapd %xmm3, %xmm4 2192; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 2193; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 2194; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 2195; SSE41-NEXT: pand %xmm4, %xmm0 2196; SSE41-NEXT: por %xmm3, %xmm0 2197; SSE41-NEXT: pxor %xmm3, %xmm3 2198; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 2199; SSE41-NEXT: movapd %xmm8, %xmm4 2200; SSE41-NEXT: xorpd %xmm2, %xmm4 2201; SSE41-NEXT: movapd %xmm4, %xmm6 2202; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 2203; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 2204; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 2205; SSE41-NEXT: pand %xmm6, %xmm0 2206; SSE41-NEXT: por %xmm4, %xmm0 2207; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 2208; SSE41-NEXT: packusdw %xmm3, %xmm5 2209; SSE41-NEXT: packusdw %xmm5, %xmm1 2210; SSE41-NEXT: movdqa %xmm1, %xmm0 2211; SSE41-NEXT: retq 2212; 2213; AVX1-LABEL: trunc_packus_v8i64_v8i16: 2214; AVX1: # %bb.0: 2215; AVX1-NEXT: vmovdqa (%rdi), %xmm0 2216; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 2217; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 2218; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 2219; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [65535,65535] 2220; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 2221; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 2222; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 2223; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 2224; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 2225; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 2226; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 2227; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 2228; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 2229; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 2230; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 2231; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 2232; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 2233; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2234; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 2235; AVX1-NEXT: vpand %xmm3, %xmm1, 
%xmm1 2236; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 2237; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 2238; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 2239; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2240; AVX1-NEXT: retq 2241; 2242; AVX2-LABEL: trunc_packus_v8i64_v8i16: 2243; AVX2: # %bb.0: 2244; AVX2-NEXT: vmovdqa (%rdi), %ymm0 2245; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 2246; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535] 2247; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 2248; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 2249; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 2250; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 2251; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2252; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 2253; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 2254; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 2255; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 2256; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2257; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2258; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2259; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2260; AVX2-NEXT: vzeroupper 2261; AVX2-NEXT: retq 2262; 2263; AVX512-LABEL: trunc_packus_v8i64_v8i16: 2264; AVX512: # %bb.0: 2265; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 2266; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 2267; AVX512-NEXT: vpmovusqw %zmm0, %xmm0 2268; AVX512-NEXT: vzeroupper 2269; AVX512-NEXT: retq 2270; 2271; SKX-LABEL: trunc_packus_v8i64_v8i16: 2272; SKX: # %bb.0: 2273; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 2274; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm1 2275; SKX-NEXT: vpmovusqw %ymm1, %xmm1 2276; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0 2277; SKX-NEXT: vpmovusqw %ymm0, %xmm0 2278; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2279; SKX-NEXT: vzeroupper 2280; SKX-NEXT: retq 2281 %a0 = load <8 x i64>, ptr %p0 2282 %1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> 2283 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 
65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535> 2284 %3 = icmp sgt <8 x i64> %2, zeroinitializer 2285 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 2286 %5 = trunc <8 x i64> %4 to <8 x i16> 2287 ret <8 x i16> %5 2288} 2289 2290define <4 x i16> @trunc_packus_v4i32_v4i16(<4 x i32> %a0) { 2291; SSE2-LABEL: trunc_packus_v4i32_v4i16: 2292; SSE2: # %bb.0: 2293; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] 2294; SSE2-NEXT: movdqa %xmm1, %xmm2 2295; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 2296; SSE2-NEXT: pand %xmm2, %xmm0 2297; SSE2-NEXT: pandn %xmm1, %xmm2 2298; SSE2-NEXT: por %xmm0, %xmm2 2299; SSE2-NEXT: pxor %xmm0, %xmm0 2300; SSE2-NEXT: movdqa %xmm2, %xmm1 2301; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 2302; SSE2-NEXT: pand %xmm2, %xmm1 2303; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] 2304; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2305; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2306; SSE2-NEXT: retq 2307; 2308; SSSE3-LABEL: trunc_packus_v4i32_v4i16: 2309; SSSE3: # %bb.0: 2310; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] 2311; SSSE3-NEXT: movdqa %xmm1, %xmm2 2312; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 2313; SSSE3-NEXT: pand %xmm2, %xmm0 2314; SSSE3-NEXT: pandn %xmm1, %xmm2 2315; SSSE3-NEXT: por %xmm2, %xmm0 2316; SSSE3-NEXT: pxor %xmm1, %xmm1 2317; SSSE3-NEXT: movdqa %xmm0, %xmm2 2318; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 2319; SSSE3-NEXT: pand %xmm2, %xmm0 2320; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] 2321; SSSE3-NEXT: retq 2322; 2323; SSE41-LABEL: trunc_packus_v4i32_v4i16: 2324; SSE41: # %bb.0: 2325; SSE41-NEXT: packusdw %xmm0, %xmm0 2326; SSE41-NEXT: retq 2327; 2328; AVX-LABEL: trunc_packus_v4i32_v4i16: 2329; AVX: # %bb.0: 2330; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 2331; AVX-NEXT: retq 2332; 2333; AVX512-LABEL: trunc_packus_v4i32_v4i16: 2334; AVX512: # %bb.0: 2335; AVX512-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 2336; AVX512-NEXT: retq 2337; 2338; 
SKX-LABEL: trunc_packus_v4i32_v4i16: 2339; SKX: # %bb.0: 2340; SKX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 2341; SKX-NEXT: retq 2342 %1 = icmp slt <4 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535> 2343 %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535> 2344 %3 = icmp sgt <4 x i32> %2, zeroinitializer 2345 %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer 2346 %5 = trunc <4 x i32> %4 to <4 x i16> 2347 ret <4 x i16> %5 2348} 2349 2350define void @trunc_packus_v4i32_v4i16_store(<4 x i32> %a0, ptr%p1) { 2351; SSE2-LABEL: trunc_packus_v4i32_v4i16_store: 2352; SSE2: # %bb.0: 2353; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] 2354; SSE2-NEXT: movdqa %xmm1, %xmm2 2355; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 2356; SSE2-NEXT: pand %xmm2, %xmm0 2357; SSE2-NEXT: pandn %xmm1, %xmm2 2358; SSE2-NEXT: por %xmm0, %xmm2 2359; SSE2-NEXT: pxor %xmm0, %xmm0 2360; SSE2-NEXT: movdqa %xmm2, %xmm1 2361; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 2362; SSE2-NEXT: pand %xmm2, %xmm1 2363; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7] 2364; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2365; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2366; SSE2-NEXT: movq %xmm0, (%rdi) 2367; SSE2-NEXT: retq 2368; 2369; SSSE3-LABEL: trunc_packus_v4i32_v4i16_store: 2370; SSSE3: # %bb.0: 2371; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] 2372; SSSE3-NEXT: movdqa %xmm1, %xmm2 2373; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 2374; SSSE3-NEXT: pand %xmm2, %xmm0 2375; SSSE3-NEXT: pandn %xmm1, %xmm2 2376; SSSE3-NEXT: por %xmm0, %xmm2 2377; SSSE3-NEXT: pxor %xmm0, %xmm0 2378; SSSE3-NEXT: movdqa %xmm2, %xmm1 2379; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 2380; SSSE3-NEXT: pand %xmm2, %xmm1 2381; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u] 2382; SSSE3-NEXT: movq %xmm1, (%rdi) 2383; SSSE3-NEXT: retq 2384; 2385; SSE41-LABEL: trunc_packus_v4i32_v4i16_store: 2386; SSE41: # %bb.0: 2387; SSE41-NEXT: packusdw 
%xmm0, %xmm0
; SSE41-NEXT: movq %xmm0, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX: # %bb.0:
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovusdw %xmm0, (%rdi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i16_store:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovusdw %xmm0, (%rdi)
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v4i32_v4i16_store:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpmovusdw %xmm0, (%rdi)
; SKX-NEXT: retq
  %1 = icmp slt <4 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535>
  %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>
  %3 = icmp sgt <4 x i32> %2, zeroinitializer
  %4 = select <4 x i1> %3, <4 x i32> %2, <4 x i32> zeroinitializer
  %5 = trunc <4 x i32> %4 to <4 x i16>
  store <4 x i16> %5, ptr%p1
  ret void
}

; Saturating truncation of signed <8 x i32> lanes to <8 x i16>.
; The IR at the end of the function clamps every lane of %a0 from above to
; 65535 (icmp slt + select), then from below to 0 (icmp sgt + select), and
; truncates the clamped value to i16.
; The per-prefix assertion lines inside the function are the expected llc
; output for the RUN lines at the top of the file. They are autogenerated:
; regenerate them with utils/update_llc_test_checks.py instead of editing
; them by hand.
define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
; SSE2-LABEL: trunc_packus_v8i32_v8i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pandn %xmm2, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm1, %xmm0
; SSE2-NEXT: pandn %xmm2, %xmm1
; SSE2-NEXT: por %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pslld $16, %xmm2
; SSE2-NEXT: psrad $16, %xmm2
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pandn %xmm2, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm1, %xmm0
; SSSE3-NEXT: pandn %xmm2, %xmm1
; SSSE3-NEXT: por %xmm1, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: movdqa %xmm3, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v8i32_v8i16:
; SSE41: # %bb.0:
; SSE41-NEXT: packusdw %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v8i32_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v8i32_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v8i32_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v8i32_v8i16:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; SKX-NEXT: vpmovusdw %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %3 = icmp sgt <8 x i32> %2, zeroinitializer
  %4 = select <8 x
i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
  %5 = trunc <8 x i32> %4 to <8 x i16>
  ret <8 x i16> %5
}

; Saturating truncation of signed <16 x i32> lanes to <16 x i16>, with the
; input loaded from memory.
; The IR at the end of the function loads a <16 x i32> vector from %p0, clamps
; every lane from above to 65535 (icmp slt + select), then from below to 0
; (icmp sgt + select), and truncates the result to i16.
; The function carries the "min-legal-vector-width"="256" attribute. In the
; assertions below the SKX run expects 256-bit (ymm) instructions while the
; shared AVX512 prefix expects 512-bit (zmm) ones.
; NOTE(review): the attribute is presumably what separates those two
; expectations - confirm against the X86 backend before relying on it.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py.
define <16 x i16> @trunc_packus_v16i32_v16i16(ptr %p0) "min-legal-vector-width"="256" {
; SSE2-LABEL: trunc_packus_v16i32_v16i16:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa (%rdi), %xmm1
; SSE2-NEXT: movdqa 16(%rdi), %xmm3
; SSE2-NEXT: movdqa 32(%rdi), %xmm0
; SSE2-NEXT: movdqa 48(%rdi), %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
; SSE2-NEXT: movdqa %xmm5, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pandn %xmm5, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm5, %xmm3
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pandn %xmm5, %xmm3
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: movdqa %xmm5, %xmm6
; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: pandn %xmm5, %xmm6
; SSE2-NEXT: por %xmm4, %xmm6
; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: pandn %xmm5, %xmm4
; SSE2-NEXT: por %xmm0, %xmm4
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: pcmpgtd %xmm5, %xmm4
; SSE2-NEXT: pand %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: pslld $16, %xmm3
; SSE2-NEXT: psrad $16, %xmm3
; SSE2-NEXT: pslld $16, %xmm0
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: packssdw %xmm3, %xmm0
; SSE2-NEXT: pslld $16, %xmm4
; SSE2-NEXT: psrad $16, %xmm4
; SSE2-NEXT: pslld $16, %xmm1
; SSE2-NEXT: psrad $16, %xmm1
; SSE2-NEXT: packssdw %xmm4, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v16i32_v16i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa (%rdi), %xmm1
; SSSE3-NEXT: movdqa 16(%rdi), %xmm3
; SSSE3-NEXT: movdqa 32(%rdi), %xmm0
; SSSE3-NEXT: movdqa 48(%rdi), %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535]
; SSSE3-NEXT: movdqa %xmm5, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pandn %xmm5, %xmm2
; SSSE3-NEXT: por %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm5, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pandn %xmm5, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm5, %xmm6
; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
; SSSE3-NEXT: pand %xmm6, %xmm4
; SSSE3-NEXT: pandn %xmm5, %xmm6
; SSSE3-NEXT: por %xmm4, %xmm6
; SSSE3-NEXT: movdqa %xmm5, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pandn %xmm5, %xmm4
; SSSE3-NEXT: por %xmm0, %xmm4
; SSSE3-NEXT: pxor %xmm5, %xmm5
; SSSE3-NEXT: movdqa %xmm4, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: movdqa %xmm6, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm4
; SSSE3-NEXT: pand %xmm6, %xmm4
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm5, %xmm3
; SSSE3-NEXT: pand %xmm2, %xmm3
; SSSE3-NEXT: pslld $16, %xmm3
; SSSE3-NEXT: psrad $16, %xmm3
; SSSE3-NEXT: pslld $16, %xmm0
; SSSE3-NEXT: psrad $16, %xmm0
; SSSE3-NEXT: packssdw %xmm3, %xmm0
; SSSE3-NEXT: pslld $16, %xmm4
; SSSE3-NEXT: psrad $16, %xmm4
; SSSE3-NEXT: pslld $16, %xmm1
; SSSE3-NEXT: psrad $16, %xmm1
; SSSE3-NEXT: packssdw %xmm4, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v16i32_v16i16:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa (%rdi), %xmm0
; SSE41-NEXT: movdqa 32(%rdi), %xmm1
; SSE41-NEXT: packusdw 16(%rdi), %xmm0
; SSE41-NEXT: packusdw 48(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i32_v16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT: vpackusdw 48(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vpackusdw 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i32_v16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v16i32_v16i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm0
; AVX512-NEXT: vpmovusdw %zmm0, %ymm0
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v16i32_v16i16:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm0
; SKX-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT: retq
  %a0 = load <16 x i32>, ptr %p0
  %1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %3 = icmp sgt <16 x i32> %2, zeroinitializer
  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
  %5 = trunc <16 x i32> %4 to <16 x i16>
  ret <16 x i16> %5
}

;
; PACKUS
saturation truncation to vXi8
;

; Saturating truncation of signed <2 x i64> lanes to <2 x i8>.
; The IR at the end of the function clamps every lane of %a0 from above to
; 255 (icmp slt + select), then from below to 0 (icmp sgt + select), and
; truncates the clamped value to i8.
; The per-prefix assertion lines are the expected llc output for the RUN
; lines at the top of the file. They are autogenerated by
; utils/update_llc_test_checks.py and should be regenerated, not hand-edited.
define <2 x i8> @trunc_packus_v2i64_v2i8(<2 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v2i64_v2i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: por %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v2i64_v2i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: por %xmm3, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm3
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v2i64_v2i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: xorpd %xmm3, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_packus_v2i64_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v2i64_v2i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovusqb %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v2i64_v2i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovusqb %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v2i64_v2i8:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpmovusqb %xmm0, %xmm0
; SKX-NEXT: retq
  %1 = icmp slt <2 x i64> %a0, <i64 255, i64 255>
  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 255, i64 255>
  %3 = icmp sgt <2 x i64> %2, zeroinitializer
  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
  %5 = trunc <2 x i64> %4 to <2 x i8>
  ret <2 x i8> %5
}

; Store variant of the <2 x i64> -> <2 x i8> saturating truncation.
; The IR at the end of the function clamps every lane of %a0 from above to
; 255 (icmp slt + select), then from below to 0 (icmp sgt + select),
; truncates to i8, stores the <2 x i8> result through %p1 and returns void.
; In the assertions below the AVX512VL, AVX512BWVL and SKX runs expect
; vpmovusqb to write straight to (%rdi); the other runs expect a separate
; store instruction.
; The assertion lines are autogenerated by utils/update_llc_test_checks.py.
define void @trunc_packus_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
; SSE2-LABEL: trunc_packus_v2i64_v2i8_store:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: por %xmm2, %xmm3
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT: por %xmm0, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pcmpeqd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm0, %xmm1
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movw %ax, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v2i64_v2i8_store:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pxor %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSSE3-NEXT: pxor %xmm4, %xmm4
; SSSE3-NEXT: pcmpeqd %xmm3, %xmm4
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903]
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
; SSSE3-NEXT: pand %xmm4, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSSE3-NEXT: por %xmm2, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm0
; SSSE3-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT: por %xmm0, %xmm3
; SSSE3-NEXT: movdqa %xmm3, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
; SSSE3-NEXT: por %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movd %xmm1, %eax
; SSSE3-NEXT: movw %ax, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v2i64_v2i8_store:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: movapd {{.*#+}} xmm2 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT: xorpd %xmm1, %xmm1
; SSE41-NEXT: movapd %xmm2, %xmm4
; SSE41-NEXT: xorpd %xmm3, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE41-NEXT: pextrw $0, %xmm1, (%rdi)
; SSE41-NEXT: retq
;
; AVX-LABEL: trunc_packus_v2i64_v2i8_store:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512F-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v2i64_v2i8_store:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovusqb %xmm0, (%rdi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v2i64_v2i8_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v2i64_v2i8_store:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovusqb %xmm0, (%rdi)
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v2i64_v2i8_store:
; SKX: # %bb.0:
; SKX-NEXT:
vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpmovusqb %xmm0, (%rdi)
; SKX-NEXT: retq
  %1 = icmp slt <2 x i64> %a0, <i64 255, i64 255>
  %2 = select <2 x i1> %1, <2 x i64> %a0, <2 x i64> <i64 255, i64 255>
  %3 = icmp sgt <2 x i64> %2, zeroinitializer
  %4 = select <2 x i1> %3, <2 x i64> %2, <2 x i64> zeroinitializer
  %5 = trunc <2 x i64> %4 to <2 x i8>
  store <2 x i8> %5, ptr%p1
  ret void
}

; Saturating truncation of signed <4 x i64> lanes to <4 x i8>.
; The IR at the end of the function clamps every lane of %a0 from above to
; 255 (icmp slt + select), then from below to 0 (icmp sgt + select), and
; truncates the clamped value to i8.
; The per-prefix assertion lines are the expected llc output for the RUN
; lines at the top of the file. They are autogenerated by
; utils/update_llc_test_checks.py and should be regenerated, not hand-edited.
define <4 x i8> @trunc_packus_v4i64_v4i8(<4 x i64> %a0) {
; SSE2-LABEL: trunc_packus_v4i64_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903]
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm8, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
; SSE2-NEXT: pcmpeqd %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
; SSE2-NEXT: por %xmm1, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i64_v4i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
; SSSE3-NEXT: movdqa %xmm1, %xmm3
; SSSE3-NEXT: pxor %xmm2, %xmm3
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
; SSSE3-NEXT: pxor %xmm9, %xmm9
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
; SSSE3-NEXT: movdqa %xmm4, %xmm7
; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7
; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm6
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
; SSSE3-NEXT: por %xmm6, %xmm3
; SSSE3-NEXT: pand %xmm3, %xmm1
; SSSE3-NEXT: pandn %xmm8, %xmm3
; SSSE3-NEXT: por %xmm1, %xmm3
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2]
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: pandn %xmm8, %xmm4
; SSSE3-NEXT: por %xmm4, %xmm0
; SSSE3-NEXT: movdqa %xmm0, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm4
; SSSE3-NEXT: pand %xmm4, %xmm0
; SSSE3-NEXT: movdqa %xmm3, %xmm1
; SSSE3-NEXT: pxor %xmm2, %xmm1
; SSSE3-NEXT: movdqa %xmm1, %xmm4
; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
; SSSE3-NEXT: pcmpeqd %xmm2, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSSE3-NEXT: pand %xmm5, %xmm1
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
; SSSE3-NEXT: por %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm3, %xmm2
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; SSSE3-NEXT: pshufb %xmm1, %xmm2
; SSSE3-NEXT: pshufb %xmm1, %xmm0
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i64_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255]
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903]
; SSE41-NEXT: movdqa %xmm6, %xmm5
; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
; SSE41-NEXT: movdqa %xmm6, %xmm7
; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: por %xmm7, %xmm0
; SSE41-NEXT: movapd %xmm4, %xmm5
; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm3, %xmm0
; SSE41-NEXT: movdqa %xmm6, %xmm1
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: por %xmm6, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
; SSE41-NEXT: xorpd %xmm2, %xmm2
; SSE41-NEXT: movapd %xmm4, %xmm1
; SSE41-NEXT: xorpd %xmm3, %xmm1
; SSE41-NEXT: movapd %xmm1, %xmm6
; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
; SSE41-NEXT: movapd %xmm5, %xmm4
; SSE41-NEXT: xorpd %xmm3, %xmm4
; SSE41-NEXT: movapd %xmm4, %xmm6
; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: por %xmm4, %xmm0
; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; SSE41-NEXT: pshufb %xmm0, %xmm2
; SSE41-NEXT: pshufb %xmm0, %xmm1
; SSE41-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v4i64_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255]
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v4i64_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v4i64_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v4i64_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovusqb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v4i64_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovusqb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v4i64_v4i8:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
; SKX-NEXT: vpmovusqb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %1 = icmp slt <4 x i64> %a0, <i64 255, i64 255, i64 255, i64 255>
  %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 255, i64 255, i64 255, i64 255>
  %3 = icmp sgt <4 x i64> %2, zeroinitializer
  %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
  %5 = trunc <4 x i64> %4 to <4 x i8>
  ret <4 x i8> %5
}

define void @trunc_packus_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; SSE2-LABEL: trunc_packus_v4i64_v4i8_store:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pxor %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3]
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903]
; SSE2-NEXT: movdqa %xmm2, %xmm7
; SSE2-NEXT: pcmpgtd %xmm4, %xmm7
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3]
; SSE2-NEXT: por %xmm6, %xmm4
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: pandn %xmm8, %xmm4
; SSE2-NEXT: por %xmm1, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: pxor %xmm3, %xmm1
;
SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 3298; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 3299; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 3300; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] 3301; SSE2-NEXT: pand %xmm5, %xmm6 3302; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] 3303; SSE2-NEXT: por %xmm6, %xmm1 3304; SSE2-NEXT: pand %xmm1, %xmm0 3305; SSE2-NEXT: pandn %xmm8, %xmm1 3306; SSE2-NEXT: por %xmm0, %xmm1 3307; SSE2-NEXT: movdqa %xmm1, %xmm0 3308; SSE2-NEXT: pxor %xmm3, %xmm0 3309; SSE2-NEXT: movdqa %xmm0, %xmm2 3310; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 3311; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 3312; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 3313; SSE2-NEXT: pand %xmm2, %xmm0 3314; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 3315; SSE2-NEXT: por %xmm0, %xmm2 3316; SSE2-NEXT: movdqa %xmm4, %xmm0 3317; SSE2-NEXT: pxor %xmm3, %xmm0 3318; SSE2-NEXT: movdqa %xmm0, %xmm5 3319; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 3320; SSE2-NEXT: pcmpeqd %xmm3, %xmm0 3321; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 3322; SSE2-NEXT: pand %xmm5, %xmm0 3323; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] 3324; SSE2-NEXT: por %xmm0, %xmm3 3325; SSE2-NEXT: pand %xmm8, %xmm3 3326; SSE2-NEXT: pand %xmm4, %xmm3 3327; SSE2-NEXT: pand %xmm8, %xmm2 3328; SSE2-NEXT: pand %xmm1, %xmm2 3329; SSE2-NEXT: packuswb %xmm3, %xmm2 3330; SSE2-NEXT: packuswb %xmm2, %xmm2 3331; SSE2-NEXT: packuswb %xmm2, %xmm2 3332; SSE2-NEXT: movd %xmm2, (%rdi) 3333; SSE2-NEXT: retq 3334; 3335; SSSE3-LABEL: trunc_packus_v4i64_v4i8_store: 3336; SSSE3: # %bb.0: 3337; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] 3338; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 3339; SSSE3-NEXT: movdqa %xmm1, %xmm3 3340; SSSE3-NEXT: pxor %xmm2, %xmm3 3341; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 3342; SSSE3-NEXT: pxor %xmm9, %xmm9 3343; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 3344; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] 3345; SSSE3-NEXT: movdqa %xmm4, %xmm7 3346; SSSE3-NEXT: pcmpgtd %xmm3, %xmm7 3347; 
SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] 3348; SSSE3-NEXT: pand %xmm5, %xmm6 3349; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] 3350; SSSE3-NEXT: por %xmm6, %xmm3 3351; SSSE3-NEXT: pand %xmm3, %xmm1 3352; SSSE3-NEXT: pandn %xmm8, %xmm3 3353; SSSE3-NEXT: por %xmm1, %xmm3 3354; SSSE3-NEXT: movdqa %xmm0, %xmm1 3355; SSSE3-NEXT: pxor %xmm2, %xmm1 3356; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 3357; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 3358; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 3359; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,2,2] 3360; SSSE3-NEXT: pand %xmm5, %xmm1 3361; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 3362; SSSE3-NEXT: por %xmm1, %xmm4 3363; SSSE3-NEXT: pand %xmm4, %xmm0 3364; SSSE3-NEXT: pandn %xmm8, %xmm4 3365; SSSE3-NEXT: por %xmm0, %xmm4 3366; SSSE3-NEXT: movdqa %xmm4, %xmm0 3367; SSSE3-NEXT: pxor %xmm2, %xmm0 3368; SSSE3-NEXT: movdqa %xmm0, %xmm1 3369; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 3370; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] 3371; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 3372; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 3373; SSSE3-NEXT: pand %xmm5, %xmm0 3374; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3375; SSSE3-NEXT: por %xmm0, %xmm1 3376; SSSE3-NEXT: pand %xmm4, %xmm1 3377; SSSE3-NEXT: movdqa %xmm3, %xmm0 3378; SSSE3-NEXT: pxor %xmm2, %xmm0 3379; SSSE3-NEXT: movdqa %xmm0, %xmm4 3380; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 3381; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 3382; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 3383; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 3384; SSSE3-NEXT: pand %xmm5, %xmm0 3385; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] 3386; SSSE3-NEXT: por %xmm0, %xmm2 3387; SSSE3-NEXT: pand %xmm3, %xmm2 3388; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 3389; SSSE3-NEXT: pshufb %xmm0, %xmm2 3390; SSSE3-NEXT: pshufb %xmm0, %xmm1 3391; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 3392; SSSE3-NEXT: movd %xmm1, (%rdi) 
3393; SSSE3-NEXT: retq 3394; 3395; SSE41-LABEL: trunc_packus_v4i64_v4i8_store: 3396; SSE41: # %bb.0: 3397; SSE41-NEXT: movdqa %xmm0, %xmm2 3398; SSE41-NEXT: movapd {{.*#+}} xmm4 = [255,255] 3399; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] 3400; SSE41-NEXT: movdqa %xmm1, %xmm0 3401; SSE41-NEXT: pxor %xmm3, %xmm0 3402; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] 3403; SSE41-NEXT: movdqa %xmm6, %xmm5 3404; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 3405; SSE41-NEXT: movdqa %xmm6, %xmm7 3406; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 3407; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 3408; SSE41-NEXT: pand %xmm5, %xmm0 3409; SSE41-NEXT: por %xmm7, %xmm0 3410; SSE41-NEXT: movapd %xmm4, %xmm5 3411; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 3412; SSE41-NEXT: movdqa %xmm2, %xmm0 3413; SSE41-NEXT: pxor %xmm3, %xmm0 3414; SSE41-NEXT: movdqa %xmm6, %xmm1 3415; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 3416; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 3417; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3418; SSE41-NEXT: pand %xmm1, %xmm0 3419; SSE41-NEXT: por %xmm6, %xmm0 3420; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 3421; SSE41-NEXT: pxor %xmm1, %xmm1 3422; SSE41-NEXT: movapd %xmm4, %xmm2 3423; SSE41-NEXT: xorpd %xmm3, %xmm2 3424; SSE41-NEXT: movapd %xmm2, %xmm6 3425; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 3426; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 3427; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 3428; SSE41-NEXT: pand %xmm6, %xmm0 3429; SSE41-NEXT: por %xmm2, %xmm0 3430; SSE41-NEXT: pxor %xmm2, %xmm2 3431; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 3432; SSE41-NEXT: movapd %xmm5, %xmm4 3433; SSE41-NEXT: xorpd %xmm3, %xmm4 3434; SSE41-NEXT: movapd %xmm4, %xmm6 3435; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 3436; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 3437; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 3438; SSE41-NEXT: pand %xmm6, %xmm0 3439; SSE41-NEXT: por %xmm4, %xmm0 3440; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 3441; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 
3442; SSE41-NEXT: pshufb %xmm0, %xmm1 3443; SSE41-NEXT: pshufb %xmm0, %xmm2 3444; SSE41-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 3445; SSE41-NEXT: movd %xmm2, (%rdi) 3446; SSE41-NEXT: retq 3447; 3448; AVX1-LABEL: trunc_packus_v4i64_v4i8_store: 3449; AVX1: # %bb.0: 3450; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3451; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255] 3452; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 3453; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 3454; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 3455; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 3456; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 3457; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 3458; AVX1-NEXT: vpand %xmm0, %xmm3, %xmm0 3459; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2 3460; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 3461; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 3462; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3463; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3464; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3465; AVX1-NEXT: vmovd %xmm0, (%rdi) 3466; AVX1-NEXT: vzeroupper 3467; AVX1-NEXT: retq 3468; 3469; AVX2-LABEL: trunc_packus_v4i64_v4i8_store: 3470; AVX2: # %bb.0: 3471; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] 3472; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 3473; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 3474; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 3475; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 3476; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0 3477; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3478; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> 3479; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3480; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3481; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 3482; AVX2-NEXT: vmovd %xmm0, (%rdi) 3483; AVX2-NEXT: vzeroupper 3484; AVX2-NEXT: retq 3485; 3486; 
AVX512F-LABEL: trunc_packus_v4i64_v4i8_store: 3487; AVX512F: # %bb.0: 3488; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3489; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 3490; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 3491; AVX512F-NEXT: vpmovusqb %zmm0, %xmm0 3492; AVX512F-NEXT: vmovd %xmm0, (%rdi) 3493; AVX512F-NEXT: vzeroupper 3494; AVX512F-NEXT: retq 3495; 3496; AVX512VL-LABEL: trunc_packus_v4i64_v4i8_store: 3497; AVX512VL: # %bb.0: 3498; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 3499; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 3500; AVX512VL-NEXT: vpmovusqb %ymm0, (%rdi) 3501; AVX512VL-NEXT: vzeroupper 3502; AVX512VL-NEXT: retq 3503; 3504; AVX512BW-LABEL: trunc_packus_v4i64_v4i8_store: 3505; AVX512BW: # %bb.0: 3506; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3507; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 3508; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 3509; AVX512BW-NEXT: vpmovusqb %zmm0, %xmm0 3510; AVX512BW-NEXT: vmovd %xmm0, (%rdi) 3511; AVX512BW-NEXT: vzeroupper 3512; AVX512BW-NEXT: retq 3513; 3514; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i8_store: 3515; AVX512BWVL: # %bb.0: 3516; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 3517; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 3518; AVX512BWVL-NEXT: vpmovusqb %ymm0, (%rdi) 3519; AVX512BWVL-NEXT: vzeroupper 3520; AVX512BWVL-NEXT: retq 3521; 3522; SKX-LABEL: trunc_packus_v4i64_v4i8_store: 3523; SKX: # %bb.0: 3524; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 3525; SKX-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 3526; SKX-NEXT: vpmovusqb %ymm0, (%rdi) 3527; SKX-NEXT: vzeroupper 3528; SKX-NEXT: retq 3529 %1 = icmp slt <4 x i64> %a0, <i64 255, i64 255, i64 255, i64 255> 3530 %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 255, i64 255, i64 255, i64 255> 3531 %3 = icmp sgt <4 x i64> %2, zeroinitializer 3532 %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer 3533 %5 = trunc <4 x i64> %4 to <4 x i8> 3534 store <4 x i8> %5, ptr%p1 3535 ret void 3536} 3537 3538define <8 x i8> 
@trunc_packus_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" { 3539; SSE2-LABEL: trunc_packus_v8i64_v8i8: 3540; SSE2: # %bb.0: 3541; SSE2-NEXT: movdqa (%rdi), %xmm5 3542; SSE2-NEXT: movdqa 16(%rdi), %xmm10 3543; SSE2-NEXT: movdqa 32(%rdi), %xmm3 3544; SSE2-NEXT: movdqa 48(%rdi), %xmm4 3545; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] 3546; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] 3547; SSE2-NEXT: movdqa %xmm3, %xmm2 3548; SSE2-NEXT: pxor %xmm11, %xmm2 3549; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 3550; SSE2-NEXT: pxor %xmm9, %xmm9 3551; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 3552; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903] 3553; SSE2-NEXT: movdqa %xmm0, %xmm6 3554; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 3555; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] 3556; SSE2-NEXT: pand %xmm7, %xmm1 3557; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] 3558; SSE2-NEXT: por %xmm1, %xmm2 3559; SSE2-NEXT: pand %xmm2, %xmm3 3560; SSE2-NEXT: pandn %xmm8, %xmm2 3561; SSE2-NEXT: por %xmm3, %xmm2 3562; SSE2-NEXT: movdqa %xmm4, %xmm1 3563; SSE2-NEXT: pxor %xmm11, %xmm1 3564; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 3565; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 3566; SSE2-NEXT: movdqa %xmm0, %xmm6 3567; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 3568; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] 3569; SSE2-NEXT: pand %xmm3, %xmm1 3570; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] 3571; SSE2-NEXT: por %xmm1, %xmm3 3572; SSE2-NEXT: pand %xmm3, %xmm4 3573; SSE2-NEXT: pandn %xmm8, %xmm3 3574; SSE2-NEXT: por %xmm4, %xmm3 3575; SSE2-NEXT: movdqa %xmm5, %xmm1 3576; SSE2-NEXT: pxor %xmm11, %xmm1 3577; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 3578; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 3579; SSE2-NEXT: movdqa %xmm0, %xmm6 3580; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 3581; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] 3582; SSE2-NEXT: pand %xmm4, %xmm1 3583; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] 3584; SSE2-NEXT: por %xmm1, %xmm4 3585; SSE2-NEXT: pand %xmm4, %xmm5 
3586; SSE2-NEXT: pandn %xmm8, %xmm4 3587; SSE2-NEXT: por %xmm5, %xmm4 3588; SSE2-NEXT: movdqa %xmm10, %xmm1 3589; SSE2-NEXT: pxor %xmm11, %xmm1 3590; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 3591; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 3592; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 3593; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] 3594; SSE2-NEXT: pand %xmm5, %xmm1 3595; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 3596; SSE2-NEXT: por %xmm1, %xmm0 3597; SSE2-NEXT: pand %xmm0, %xmm10 3598; SSE2-NEXT: pandn %xmm8, %xmm0 3599; SSE2-NEXT: por %xmm10, %xmm0 3600; SSE2-NEXT: movdqa %xmm0, %xmm1 3601; SSE2-NEXT: pxor %xmm11, %xmm1 3602; SSE2-NEXT: movdqa %xmm1, %xmm5 3603; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 3604; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 3605; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 3606; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3607; SSE2-NEXT: pand %xmm6, %xmm1 3608; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 3609; SSE2-NEXT: por %xmm1, %xmm5 3610; SSE2-NEXT: pand %xmm0, %xmm5 3611; SSE2-NEXT: movdqa %xmm4, %xmm0 3612; SSE2-NEXT: pxor %xmm11, %xmm0 3613; SSE2-NEXT: movdqa %xmm0, %xmm1 3614; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 3615; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] 3616; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 3617; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] 3618; SSE2-NEXT: pand %xmm6, %xmm7 3619; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] 3620; SSE2-NEXT: por %xmm7, %xmm0 3621; SSE2-NEXT: pand %xmm4, %xmm0 3622; SSE2-NEXT: packuswb %xmm5, %xmm0 3623; SSE2-NEXT: movdqa %xmm3, %xmm1 3624; SSE2-NEXT: pxor %xmm11, %xmm1 3625; SSE2-NEXT: movdqa %xmm1, %xmm4 3626; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 3627; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 3628; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 3629; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3630; SSE2-NEXT: pand %xmm5, %xmm1 3631; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 3632; SSE2-NEXT: por %xmm1, %xmm4 3633; SSE2-NEXT: pand %xmm3, %xmm4 3634; SSE2-NEXT: movdqa %xmm2, %xmm1 
3635; SSE2-NEXT: pxor %xmm11, %xmm1 3636; SSE2-NEXT: movdqa %xmm1, %xmm3 3637; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 3638; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] 3639; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 3640; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3641; SSE2-NEXT: pand %xmm5, %xmm1 3642; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 3643; SSE2-NEXT: por %xmm1, %xmm3 3644; SSE2-NEXT: pand %xmm2, %xmm3 3645; SSE2-NEXT: packuswb %xmm4, %xmm3 3646; SSE2-NEXT: packuswb %xmm3, %xmm0 3647; SSE2-NEXT: packuswb %xmm0, %xmm0 3648; SSE2-NEXT: retq 3649; 3650; SSSE3-LABEL: trunc_packus_v8i64_v8i8: 3651; SSSE3: # %bb.0: 3652; SSSE3-NEXT: movdqa (%rdi), %xmm5 3653; SSSE3-NEXT: movdqa 16(%rdi), %xmm10 3654; SSSE3-NEXT: movdqa 32(%rdi), %xmm3 3655; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 3656; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] 3657; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] 3658; SSSE3-NEXT: movdqa %xmm3, %xmm2 3659; SSSE3-NEXT: pxor %xmm11, %xmm2 3660; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 3661; SSSE3-NEXT: pxor %xmm9, %xmm9 3662; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 3663; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,2147483903] 3664; SSSE3-NEXT: movdqa %xmm0, %xmm6 3665; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 3666; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] 3667; SSSE3-NEXT: pand %xmm7, %xmm1 3668; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] 3669; SSSE3-NEXT: por %xmm1, %xmm2 3670; SSSE3-NEXT: pand %xmm2, %xmm3 3671; SSSE3-NEXT: pandn %xmm8, %xmm2 3672; SSSE3-NEXT: por %xmm3, %xmm2 3673; SSSE3-NEXT: movdqa %xmm4, %xmm1 3674; SSSE3-NEXT: pxor %xmm11, %xmm1 3675; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 3676; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 3677; SSSE3-NEXT: movdqa %xmm0, %xmm6 3678; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 3679; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] 3680; SSSE3-NEXT: pand %xmm3, %xmm1 3681; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] 3682; SSSE3-NEXT: por %xmm1, %xmm3 3683; SSSE3-NEXT: pand 
%xmm3, %xmm4 3684; SSSE3-NEXT: pandn %xmm8, %xmm3 3685; SSSE3-NEXT: por %xmm4, %xmm3 3686; SSSE3-NEXT: movdqa %xmm5, %xmm1 3687; SSSE3-NEXT: pxor %xmm11, %xmm1 3688; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 3689; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 3690; SSSE3-NEXT: movdqa %xmm0, %xmm6 3691; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 3692; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] 3693; SSSE3-NEXT: pand %xmm4, %xmm1 3694; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] 3695; SSSE3-NEXT: por %xmm1, %xmm4 3696; SSSE3-NEXT: pand %xmm4, %xmm5 3697; SSSE3-NEXT: pandn %xmm8, %xmm4 3698; SSSE3-NEXT: por %xmm5, %xmm4 3699; SSSE3-NEXT: movdqa %xmm10, %xmm1 3700; SSSE3-NEXT: pxor %xmm11, %xmm1 3701; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 3702; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 3703; SSSE3-NEXT: pcmpgtd %xmm1, %xmm0 3704; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] 3705; SSSE3-NEXT: pand %xmm5, %xmm1 3706; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 3707; SSSE3-NEXT: por %xmm1, %xmm0 3708; SSSE3-NEXT: pand %xmm0, %xmm10 3709; SSSE3-NEXT: pandn %xmm8, %xmm0 3710; SSSE3-NEXT: por %xmm10, %xmm0 3711; SSSE3-NEXT: movdqa %xmm0, %xmm1 3712; SSSE3-NEXT: pxor %xmm11, %xmm1 3713; SSSE3-NEXT: movdqa %xmm1, %xmm5 3714; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 3715; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 3716; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 3717; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3718; SSSE3-NEXT: pand %xmm6, %xmm1 3719; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 3720; SSSE3-NEXT: por %xmm1, %xmm5 3721; SSSE3-NEXT: pand %xmm0, %xmm5 3722; SSSE3-NEXT: movdqa %xmm4, %xmm0 3723; SSSE3-NEXT: pxor %xmm11, %xmm0 3724; SSSE3-NEXT: movdqa %xmm0, %xmm1 3725; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 3726; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] 3727; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 3728; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] 3729; SSSE3-NEXT: pand %xmm6, %xmm7 3730; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] 3731; SSSE3-NEXT: 
por %xmm7, %xmm0 3732; SSSE3-NEXT: pand %xmm4, %xmm0 3733; SSSE3-NEXT: packuswb %xmm5, %xmm0 3734; SSSE3-NEXT: movdqa %xmm3, %xmm1 3735; SSSE3-NEXT: pxor %xmm11, %xmm1 3736; SSSE3-NEXT: movdqa %xmm1, %xmm4 3737; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 3738; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 3739; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 3740; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3741; SSSE3-NEXT: pand %xmm5, %xmm1 3742; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 3743; SSSE3-NEXT: por %xmm1, %xmm4 3744; SSSE3-NEXT: pand %xmm3, %xmm4 3745; SSSE3-NEXT: movdqa %xmm2, %xmm1 3746; SSSE3-NEXT: pxor %xmm11, %xmm1 3747; SSSE3-NEXT: movdqa %xmm1, %xmm3 3748; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 3749; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] 3750; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 3751; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 3752; SSSE3-NEXT: pand %xmm5, %xmm1 3753; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 3754; SSSE3-NEXT: por %xmm1, %xmm3 3755; SSSE3-NEXT: pand %xmm2, %xmm3 3756; SSSE3-NEXT: packuswb %xmm4, %xmm3 3757; SSSE3-NEXT: packuswb %xmm3, %xmm0 3758; SSSE3-NEXT: packuswb %xmm0, %xmm0 3759; SSSE3-NEXT: retq 3760; 3761; SSE41-LABEL: trunc_packus_v8i64_v8i8: 3762; SSE41: # %bb.0: 3763; SSE41-NEXT: movdqa (%rdi), %xmm10 3764; SSE41-NEXT: movdqa 16(%rdi), %xmm9 3765; SSE41-NEXT: movdqa 32(%rdi), %xmm3 3766; SSE41-NEXT: movdqa 48(%rdi), %xmm5 3767; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] 3768; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 3769; SSE41-NEXT: movdqa %xmm3, %xmm0 3770; SSE41-NEXT: pxor %xmm2, %xmm0 3771; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] 3772; SSE41-NEXT: movdqa %xmm4, %xmm7 3773; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 3774; SSE41-NEXT: movdqa %xmm4, %xmm6 3775; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 3776; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3777; SSE41-NEXT: pand %xmm7, %xmm0 3778; SSE41-NEXT: por %xmm6, %xmm0 3779; SSE41-NEXT: movapd %xmm1, %xmm8 3780; SSE41-NEXT: 
blendvpd %xmm0, %xmm3, %xmm8 3781; SSE41-NEXT: movdqa %xmm5, %xmm0 3782; SSE41-NEXT: pxor %xmm2, %xmm0 3783; SSE41-NEXT: movdqa %xmm4, %xmm3 3784; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 3785; SSE41-NEXT: movdqa %xmm4, %xmm6 3786; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 3787; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3788; SSE41-NEXT: pand %xmm3, %xmm0 3789; SSE41-NEXT: por %xmm6, %xmm0 3790; SSE41-NEXT: movapd %xmm1, %xmm6 3791; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 3792; SSE41-NEXT: movdqa %xmm10, %xmm0 3793; SSE41-NEXT: pxor %xmm2, %xmm0 3794; SSE41-NEXT: movdqa %xmm4, %xmm3 3795; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 3796; SSE41-NEXT: movdqa %xmm4, %xmm5 3797; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 3798; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 3799; SSE41-NEXT: pand %xmm3, %xmm0 3800; SSE41-NEXT: por %xmm5, %xmm0 3801; SSE41-NEXT: movapd %xmm1, %xmm3 3802; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 3803; SSE41-NEXT: movdqa %xmm9, %xmm0 3804; SSE41-NEXT: pxor %xmm2, %xmm0 3805; SSE41-NEXT: movdqa %xmm4, %xmm5 3806; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 3807; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 3808; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 3809; SSE41-NEXT: pand %xmm5, %xmm0 3810; SSE41-NEXT: por %xmm4, %xmm0 3811; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 3812; SSE41-NEXT: pxor %xmm5, %xmm5 3813; SSE41-NEXT: movapd %xmm1, %xmm4 3814; SSE41-NEXT: xorpd %xmm2, %xmm4 3815; SSE41-NEXT: movapd %xmm4, %xmm7 3816; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 3817; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 3818; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 3819; SSE41-NEXT: pand %xmm7, %xmm0 3820; SSE41-NEXT: por %xmm4, %xmm0 3821; SSE41-NEXT: pxor %xmm4, %xmm4 3822; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 3823; SSE41-NEXT: movapd %xmm3, %xmm1 3824; SSE41-NEXT: xorpd %xmm2, %xmm1 3825; SSE41-NEXT: movapd %xmm1, %xmm7 3826; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 3827; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 3828; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] 3829; SSE41-NEXT: pand %xmm7, %xmm0 3830; 
SSE41-NEXT: por %xmm1, %xmm0 3831; SSE41-NEXT: pxor %xmm1, %xmm1 3832; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 3833; SSE41-NEXT: packusdw %xmm4, %xmm1 3834; SSE41-NEXT: movapd %xmm6, %xmm3 3835; SSE41-NEXT: xorpd %xmm2, %xmm3 3836; SSE41-NEXT: movapd %xmm3, %xmm4 3837; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 3838; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 3839; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 3840; SSE41-NEXT: pand %xmm4, %xmm0 3841; SSE41-NEXT: por %xmm3, %xmm0 3842; SSE41-NEXT: pxor %xmm3, %xmm3 3843; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 3844; SSE41-NEXT: movapd %xmm8, %xmm4 3845; SSE41-NEXT: xorpd %xmm2, %xmm4 3846; SSE41-NEXT: movapd %xmm4, %xmm6 3847; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 3848; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 3849; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 3850; SSE41-NEXT: pand %xmm6, %xmm0 3851; SSE41-NEXT: por %xmm4, %xmm0 3852; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 3853; SSE41-NEXT: packusdw %xmm3, %xmm5 3854; SSE41-NEXT: packusdw %xmm5, %xmm1 3855; SSE41-NEXT: packuswb %xmm1, %xmm1 3856; SSE41-NEXT: movdqa %xmm1, %xmm0 3857; SSE41-NEXT: retq 3858; 3859; AVX1-LABEL: trunc_packus_v8i64_v8i8: 3860; AVX1: # %bb.0: 3861; AVX1-NEXT: vmovdqa (%rdi), %xmm0 3862; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 3863; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 3864; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 3865; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] 3866; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 3867; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 3868; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 3869; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 3870; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 3871; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 3872; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 3873; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 3874; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 3875; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 3876; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 3877; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 3878; AVX1-NEXT: vpand %xmm0, %xmm5, 
%xmm0 3879; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3880; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 3881; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 3882; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 3883; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 3884; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 3885; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3886; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 3887; AVX1-NEXT: retq 3888; 3889; AVX2-LABEL: trunc_packus_v8i64_v8i8: 3890; AVX2: # %bb.0: 3891; AVX2-NEXT: vmovdqa (%rdi), %ymm0 3892; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 3893; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] 3894; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 3895; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 3896; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 3897; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 3898; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3899; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 3900; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 3901; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 3902; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 3903; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3904; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3905; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3906; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3907; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 3908; AVX2-NEXT: vzeroupper 3909; AVX2-NEXT: retq 3910; 3911; AVX512-LABEL: trunc_packus_v8i64_v8i8: 3912; AVX512: # %bb.0: 3913; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 3914; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 3915; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 3916; AVX512-NEXT: vzeroupper 3917; AVX512-NEXT: retq 3918; 3919; SKX-LABEL: trunc_packus_v8i64_v8i8: 3920; SKX: # %bb.0: 3921; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3922; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm1 3923; SKX-NEXT: vpmovusqb %ymm1, %xmm1 3924; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0 3925; SKX-NEXT: vpmovusqb %ymm0, %xmm0 3926; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3927; SKX-NEXT: vzeroupper 3928; SKX-NEXT: retq 3929 %a0 = 
load <8 x i64>, ptr %p0 3930 %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 3931 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 3932 %3 = icmp sgt <8 x i64> %2, zeroinitializer 3933 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 3934 %5 = trunc <8 x i64> %4 to <8 x i8> 3935 ret <8 x i8> %5 3936} 3937 3938define void @trunc_packus_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-width"="256" { 3939; SSE2-LABEL: trunc_packus_v8i64_v8i8_store: 3940; SSE2: # %bb.0: 3941; SSE2-NEXT: movdqa (%rdi), %xmm5 3942; SSE2-NEXT: movdqa 16(%rdi), %xmm10 3943; SSE2-NEXT: movdqa 32(%rdi), %xmm2 3944; SSE2-NEXT: movdqa 48(%rdi), %xmm4 3945; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] 3946; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] 3947; SSE2-NEXT: movdqa %xmm2, %xmm1 3948; SSE2-NEXT: pxor %xmm11, %xmm1 3949; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 3950; SSE2-NEXT: pxor %xmm9, %xmm9 3951; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 3952; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] 3953; SSE2-NEXT: movdqa %xmm3, %xmm6 3954; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 3955; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3956; SSE2-NEXT: pand %xmm7, %xmm0 3957; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] 3958; SSE2-NEXT: por %xmm0, %xmm1 3959; SSE2-NEXT: pand %xmm1, %xmm2 3960; SSE2-NEXT: pandn %xmm8, %xmm1 3961; SSE2-NEXT: por %xmm2, %xmm1 3962; SSE2-NEXT: movdqa %xmm4, %xmm0 3963; SSE2-NEXT: pxor %xmm11, %xmm0 3964; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 3965; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 3966; SSE2-NEXT: movdqa %xmm3, %xmm6 3967; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 3968; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3969; SSE2-NEXT: pand %xmm2, %xmm0 3970; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] 3971; SSE2-NEXT: por %xmm0, %xmm2 3972; SSE2-NEXT: pand %xmm2, %xmm4 3973; SSE2-NEXT: pandn %xmm8, 
%xmm2 3974; SSE2-NEXT: por %xmm4, %xmm2 3975; SSE2-NEXT: movdqa %xmm5, %xmm0 3976; SSE2-NEXT: pxor %xmm11, %xmm0 3977; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 3978; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 3979; SSE2-NEXT: movdqa %xmm3, %xmm6 3980; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 3981; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 3982; SSE2-NEXT: pand %xmm4, %xmm0 3983; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] 3984; SSE2-NEXT: por %xmm0, %xmm4 3985; SSE2-NEXT: pand %xmm4, %xmm5 3986; SSE2-NEXT: pandn %xmm8, %xmm4 3987; SSE2-NEXT: por %xmm5, %xmm4 3988; SSE2-NEXT: movdqa %xmm10, %xmm0 3989; SSE2-NEXT: pxor %xmm11, %xmm0 3990; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 3991; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 3992; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 3993; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 3994; SSE2-NEXT: pand %xmm5, %xmm0 3995; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 3996; SSE2-NEXT: por %xmm0, %xmm3 3997; SSE2-NEXT: pand %xmm3, %xmm10 3998; SSE2-NEXT: pandn %xmm8, %xmm3 3999; SSE2-NEXT: por %xmm10, %xmm3 4000; SSE2-NEXT: movdqa %xmm3, %xmm0 4001; SSE2-NEXT: pxor %xmm11, %xmm0 4002; SSE2-NEXT: movdqa %xmm0, %xmm5 4003; SSE2-NEXT: pcmpgtd %xmm11, %xmm5 4004; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 4005; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 4006; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4007; SSE2-NEXT: pand %xmm6, %xmm0 4008; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 4009; SSE2-NEXT: por %xmm0, %xmm5 4010; SSE2-NEXT: pand %xmm3, %xmm5 4011; SSE2-NEXT: movdqa %xmm4, %xmm0 4012; SSE2-NEXT: pxor %xmm11, %xmm0 4013; SSE2-NEXT: movdqa %xmm0, %xmm3 4014; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 4015; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] 4016; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 4017; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4018; SSE2-NEXT: pand %xmm6, %xmm0 4019; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 4020; SSE2-NEXT: por %xmm0, %xmm3 4021; SSE2-NEXT: pand %xmm4, %xmm3 4022; SSE2-NEXT: packuswb %xmm5, %xmm3 
4023; SSE2-NEXT: movdqa %xmm2, %xmm0 4024; SSE2-NEXT: pxor %xmm11, %xmm0 4025; SSE2-NEXT: movdqa %xmm0, %xmm4 4026; SSE2-NEXT: pcmpgtd %xmm11, %xmm4 4027; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 4028; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 4029; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4030; SSE2-NEXT: pand %xmm5, %xmm0 4031; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 4032; SSE2-NEXT: por %xmm0, %xmm4 4033; SSE2-NEXT: pand %xmm2, %xmm4 4034; SSE2-NEXT: movdqa %xmm1, %xmm0 4035; SSE2-NEXT: pxor %xmm11, %xmm0 4036; SSE2-NEXT: movdqa %xmm0, %xmm2 4037; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 4038; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] 4039; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 4040; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4041; SSE2-NEXT: pand %xmm5, %xmm0 4042; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 4043; SSE2-NEXT: por %xmm0, %xmm2 4044; SSE2-NEXT: pand %xmm1, %xmm2 4045; SSE2-NEXT: packuswb %xmm4, %xmm2 4046; SSE2-NEXT: packuswb %xmm2, %xmm3 4047; SSE2-NEXT: packuswb %xmm3, %xmm3 4048; SSE2-NEXT: movq %xmm3, (%rsi) 4049; SSE2-NEXT: retq 4050; 4051; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store: 4052; SSSE3: # %bb.0: 4053; SSSE3-NEXT: movdqa (%rdi), %xmm5 4054; SSSE3-NEXT: movdqa 16(%rdi), %xmm10 4055; SSSE3-NEXT: movdqa 32(%rdi), %xmm2 4056; SSSE3-NEXT: movdqa 48(%rdi), %xmm4 4057; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] 4058; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648] 4059; SSSE3-NEXT: movdqa %xmm2, %xmm1 4060; SSSE3-NEXT: pxor %xmm11, %xmm1 4061; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 4062; SSSE3-NEXT: pxor %xmm9, %xmm9 4063; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 4064; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] 4065; SSSE3-NEXT: movdqa %xmm3, %xmm6 4066; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 4067; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 4068; SSSE3-NEXT: pand %xmm7, %xmm0 4069; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] 4070; SSSE3-NEXT: por %xmm0, %xmm1 4071; SSSE3-NEXT: pand %xmm1, 
%xmm2 4072; SSSE3-NEXT: pandn %xmm8, %xmm1 4073; SSSE3-NEXT: por %xmm2, %xmm1 4074; SSSE3-NEXT: movdqa %xmm4, %xmm0 4075; SSSE3-NEXT: pxor %xmm11, %xmm0 4076; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 4077; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 4078; SSSE3-NEXT: movdqa %xmm3, %xmm6 4079; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 4080; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 4081; SSSE3-NEXT: pand %xmm2, %xmm0 4082; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] 4083; SSSE3-NEXT: por %xmm0, %xmm2 4084; SSSE3-NEXT: pand %xmm2, %xmm4 4085; SSSE3-NEXT: pandn %xmm8, %xmm2 4086; SSSE3-NEXT: por %xmm4, %xmm2 4087; SSSE3-NEXT: movdqa %xmm5, %xmm0 4088; SSSE3-NEXT: pxor %xmm11, %xmm0 4089; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 4090; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 4091; SSSE3-NEXT: movdqa %xmm3, %xmm6 4092; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 4093; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 4094; SSSE3-NEXT: pand %xmm4, %xmm0 4095; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] 4096; SSSE3-NEXT: por %xmm0, %xmm4 4097; SSSE3-NEXT: pand %xmm4, %xmm5 4098; SSSE3-NEXT: pandn %xmm8, %xmm4 4099; SSSE3-NEXT: por %xmm5, %xmm4 4100; SSSE3-NEXT: movdqa %xmm10, %xmm0 4101; SSSE3-NEXT: pxor %xmm11, %xmm0 4102; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 4103; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 4104; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 4105; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4106; SSSE3-NEXT: pand %xmm5, %xmm0 4107; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 4108; SSSE3-NEXT: por %xmm0, %xmm3 4109; SSSE3-NEXT: pand %xmm3, %xmm10 4110; SSSE3-NEXT: pandn %xmm8, %xmm3 4111; SSSE3-NEXT: por %xmm10, %xmm3 4112; SSSE3-NEXT: movdqa %xmm3, %xmm0 4113; SSSE3-NEXT: pxor %xmm11, %xmm0 4114; SSSE3-NEXT: movdqa %xmm0, %xmm5 4115; SSSE3-NEXT: pcmpgtd %xmm11, %xmm5 4116; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 4117; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 4118; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4119; SSSE3-NEXT: pand %xmm6, %xmm0 4120; 
SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 4121; SSSE3-NEXT: por %xmm0, %xmm5 4122; SSSE3-NEXT: pand %xmm3, %xmm5 4123; SSSE3-NEXT: movdqa %xmm4, %xmm0 4124; SSSE3-NEXT: pxor %xmm11, %xmm0 4125; SSSE3-NEXT: movdqa %xmm0, %xmm3 4126; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 4127; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] 4128; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 4129; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4130; SSSE3-NEXT: pand %xmm6, %xmm0 4131; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 4132; SSSE3-NEXT: por %xmm0, %xmm3 4133; SSSE3-NEXT: pand %xmm4, %xmm3 4134; SSSE3-NEXT: packuswb %xmm5, %xmm3 4135; SSSE3-NEXT: movdqa %xmm2, %xmm0 4136; SSSE3-NEXT: pxor %xmm11, %xmm0 4137; SSSE3-NEXT: movdqa %xmm0, %xmm4 4138; SSSE3-NEXT: pcmpgtd %xmm11, %xmm4 4139; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 4140; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 4141; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4142; SSSE3-NEXT: pand %xmm5, %xmm0 4143; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 4144; SSSE3-NEXT: por %xmm0, %xmm4 4145; SSSE3-NEXT: pand %xmm2, %xmm4 4146; SSSE3-NEXT: movdqa %xmm1, %xmm0 4147; SSSE3-NEXT: pxor %xmm11, %xmm0 4148; SSSE3-NEXT: movdqa %xmm0, %xmm2 4149; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 4150; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] 4151; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 4152; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4153; SSSE3-NEXT: pand %xmm5, %xmm0 4154; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 4155; SSSE3-NEXT: por %xmm0, %xmm2 4156; SSSE3-NEXT: pand %xmm1, %xmm2 4157; SSSE3-NEXT: packuswb %xmm4, %xmm2 4158; SSSE3-NEXT: packuswb %xmm2, %xmm3 4159; SSSE3-NEXT: packuswb %xmm3, %xmm3 4160; SSSE3-NEXT: movq %xmm3, (%rsi) 4161; SSSE3-NEXT: retq 4162; 4163; SSE41-LABEL: trunc_packus_v8i64_v8i8_store: 4164; SSE41: # %bb.0: 4165; SSE41-NEXT: movdqa (%rdi), %xmm10 4166; SSE41-NEXT: movdqa 16(%rdi), %xmm9 4167; SSE41-NEXT: movdqa 32(%rdi), %xmm2 4168; SSE41-NEXT: movdqa 48(%rdi), %xmm5 4169; SSE41-NEXT: 
movapd {{.*#+}} xmm4 = [255,255] 4170; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 4171; SSE41-NEXT: movdqa %xmm2, %xmm0 4172; SSE41-NEXT: pxor %xmm1, %xmm0 4173; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483903,2147483903] 4174; SSE41-NEXT: movdqa %xmm3, %xmm7 4175; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 4176; SSE41-NEXT: movdqa %xmm3, %xmm6 4177; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 4178; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 4179; SSE41-NEXT: pand %xmm7, %xmm0 4180; SSE41-NEXT: por %xmm6, %xmm0 4181; SSE41-NEXT: movapd %xmm4, %xmm8 4182; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 4183; SSE41-NEXT: movdqa %xmm5, %xmm0 4184; SSE41-NEXT: pxor %xmm1, %xmm0 4185; SSE41-NEXT: movdqa %xmm3, %xmm2 4186; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 4187; SSE41-NEXT: movdqa %xmm3, %xmm6 4188; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 4189; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 4190; SSE41-NEXT: pand %xmm2, %xmm0 4191; SSE41-NEXT: por %xmm6, %xmm0 4192; SSE41-NEXT: movapd %xmm4, %xmm6 4193; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm6 4194; SSE41-NEXT: movdqa %xmm10, %xmm0 4195; SSE41-NEXT: pxor %xmm1, %xmm0 4196; SSE41-NEXT: movdqa %xmm3, %xmm2 4197; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 4198; SSE41-NEXT: movdqa %xmm3, %xmm5 4199; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 4200; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 4201; SSE41-NEXT: pand %xmm2, %xmm0 4202; SSE41-NEXT: por %xmm5, %xmm0 4203; SSE41-NEXT: movapd %xmm4, %xmm2 4204; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 4205; SSE41-NEXT: movdqa %xmm9, %xmm0 4206; SSE41-NEXT: pxor %xmm1, %xmm0 4207; SSE41-NEXT: movdqa %xmm3, %xmm5 4208; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 4209; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 4210; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4211; SSE41-NEXT: pand %xmm5, %xmm0 4212; SSE41-NEXT: por %xmm3, %xmm0 4213; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 4214; SSE41-NEXT: pxor %xmm5, %xmm5 4215; SSE41-NEXT: movapd %xmm4, %xmm3 4216; SSE41-NEXT: xorpd %xmm1, %xmm3 4217; SSE41-NEXT: movapd %xmm3, %xmm7 4218; 
SSE41-NEXT: pcmpeqd %xmm1, %xmm7 4219; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 4220; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4221; SSE41-NEXT: pand %xmm7, %xmm0 4222; SSE41-NEXT: por %xmm3, %xmm0 4223; SSE41-NEXT: pxor %xmm3, %xmm3 4224; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 4225; SSE41-NEXT: movapd %xmm2, %xmm4 4226; SSE41-NEXT: xorpd %xmm1, %xmm4 4227; SSE41-NEXT: movapd %xmm4, %xmm7 4228; SSE41-NEXT: pcmpeqd %xmm1, %xmm7 4229; SSE41-NEXT: pcmpgtd %xmm1, %xmm4 4230; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 4231; SSE41-NEXT: pand %xmm7, %xmm0 4232; SSE41-NEXT: por %xmm4, %xmm0 4233; SSE41-NEXT: pxor %xmm4, %xmm4 4234; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 4235; SSE41-NEXT: packusdw %xmm3, %xmm4 4236; SSE41-NEXT: movapd %xmm6, %xmm2 4237; SSE41-NEXT: xorpd %xmm1, %xmm2 4238; SSE41-NEXT: movapd %xmm2, %xmm3 4239; SSE41-NEXT: pcmpeqd %xmm1, %xmm3 4240; SSE41-NEXT: pcmpgtd %xmm1, %xmm2 4241; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 4242; SSE41-NEXT: pand %xmm3, %xmm0 4243; SSE41-NEXT: por %xmm2, %xmm0 4244; SSE41-NEXT: pxor %xmm2, %xmm2 4245; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm2 4246; SSE41-NEXT: movapd %xmm8, %xmm3 4247; SSE41-NEXT: xorpd %xmm1, %xmm3 4248; SSE41-NEXT: movapd %xmm3, %xmm6 4249; SSE41-NEXT: pcmpeqd %xmm1, %xmm6 4250; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 4251; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4252; SSE41-NEXT: pand %xmm6, %xmm0 4253; SSE41-NEXT: por %xmm3, %xmm0 4254; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm5 4255; SSE41-NEXT: packusdw %xmm2, %xmm5 4256; SSE41-NEXT: packusdw %xmm5, %xmm4 4257; SSE41-NEXT: packuswb %xmm4, %xmm4 4258; SSE41-NEXT: movq %xmm4, (%rsi) 4259; SSE41-NEXT: retq 4260; 4261; AVX1-LABEL: trunc_packus_v8i64_v8i8_store: 4262; AVX1: # %bb.0: 4263; AVX1-NEXT: vmovdqa (%rdi), %xmm0 4264; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 4265; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 4266; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 4267; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] 4268; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm5 
4269; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2 4270; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 4271; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 4272; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 4273; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 4274; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5 4275; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 4276; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 4277; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 4278; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 4279; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 4280; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 4281; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4282; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1 4283; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 4284; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 4285; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 4286; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 4287; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4288; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 4289; AVX1-NEXT: vmovq %xmm0, (%rsi) 4290; AVX1-NEXT: retq 4291; 4292; AVX2-LABEL: trunc_packus_v8i64_v8i8_store: 4293; AVX2: # %bb.0: 4294; AVX2-NEXT: vmovdqa (%rdi), %ymm0 4295; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 4296; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] 4297; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 4298; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 4299; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 4300; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 4301; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 4302; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 4303; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 4304; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 4305; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 4306; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 4307; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4308; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4309; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 4310; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 4311; AVX2-NEXT: vmovq %xmm0, (%rsi) 4312; AVX2-NEXT: vzeroupper 4313; AVX2-NEXT: retq 4314; 4315; AVX512-LABEL: 
trunc_packus_v8i64_v8i8_store: 4316; AVX512: # %bb.0: 4317; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 4318; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 4319; AVX512-NEXT: vpmovusqb %zmm0, (%rsi) 4320; AVX512-NEXT: vzeroupper 4321; AVX512-NEXT: retq 4322; 4323; SKX-LABEL: trunc_packus_v8i64_v8i8_store: 4324; SKX: # %bb.0: 4325; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 4326; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm1 4327; SKX-NEXT: vpmovusqb %ymm1, %xmm1 4328; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0 4329; SKX-NEXT: vpmovusqb %ymm0, %xmm0 4330; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4331; SKX-NEXT: vmovq %xmm0, (%rsi) 4332; SKX-NEXT: vzeroupper 4333; SKX-NEXT: retq 4334 %a0 = load <8 x i64>, ptr %p0 4335 %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 4336 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 4337 %3 = icmp sgt <8 x i64> %2, zeroinitializer 4338 %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer 4339 %5 = trunc <8 x i64> %4 to <8 x i8> 4340 store <8 x i8> %5, ptr%p1 4341 ret void 4342} 4343 4344define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256" { 4345; SSE2-LABEL: trunc_packus_v16i64_v16i8: 4346; SSE2: # %bb.0: 4347; SSE2-NEXT: movdqa (%rdi), %xmm11 4348; SSE2-NEXT: movdqa 16(%rdi), %xmm9 4349; SSE2-NEXT: movdqa 32(%rdi), %xmm15 4350; SSE2-NEXT: movdqa 48(%rdi), %xmm12 4351; SSE2-NEXT: movdqa 80(%rdi), %xmm2 4352; SSE2-NEXT: movdqa 64(%rdi), %xmm5 4353; SSE2-NEXT: movdqa 112(%rdi), %xmm3 4354; SSE2-NEXT: movdqa 96(%rdi), %xmm14 4355; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] 4356; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 4357; SSE2-NEXT: movdqa %xmm14, %xmm7 4358; SSE2-NEXT: pxor %xmm1, %xmm7 4359; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] 4360; SSE2-NEXT: pxor %xmm10, %xmm10 4361; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 4362; SSE2-NEXT: movdqa 
{{.*#+}} xmm6 = [2147483903,2147483903] 4363; SSE2-NEXT: movdqa %xmm6, %xmm4 4364; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 4365; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] 4366; SSE2-NEXT: pand %xmm0, %xmm7 4367; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] 4368; SSE2-NEXT: por %xmm7, %xmm13 4369; SSE2-NEXT: pand %xmm13, %xmm14 4370; SSE2-NEXT: pandn %xmm8, %xmm13 4371; SSE2-NEXT: por %xmm14, %xmm13 4372; SSE2-NEXT: movdqa %xmm3, %xmm0 4373; SSE2-NEXT: pxor %xmm1, %xmm0 4374; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 4375; SSE2-NEXT: pcmpeqd %xmm10, %xmm4 4376; SSE2-NEXT: movdqa %xmm6, %xmm7 4377; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 4378; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 4379; SSE2-NEXT: pand %xmm4, %xmm0 4380; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3] 4381; SSE2-NEXT: por %xmm0, %xmm14 4382; SSE2-NEXT: pand %xmm14, %xmm3 4383; SSE2-NEXT: pandn %xmm8, %xmm14 4384; SSE2-NEXT: por %xmm3, %xmm14 4385; SSE2-NEXT: movdqa %xmm5, %xmm0 4386; SSE2-NEXT: pxor %xmm1, %xmm0 4387; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 4388; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 4389; SSE2-NEXT: movdqa %xmm6, %xmm4 4390; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 4391; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 4392; SSE2-NEXT: pand %xmm3, %xmm0 4393; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 4394; SSE2-NEXT: por %xmm0, %xmm4 4395; SSE2-NEXT: pand %xmm4, %xmm5 4396; SSE2-NEXT: pandn %xmm8, %xmm4 4397; SSE2-NEXT: por %xmm5, %xmm4 4398; SSE2-NEXT: movdqa %xmm2, %xmm0 4399; SSE2-NEXT: pxor %xmm1, %xmm0 4400; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 4401; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 4402; SSE2-NEXT: movdqa %xmm6, %xmm5 4403; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 4404; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 4405; SSE2-NEXT: pand %xmm3, %xmm0 4406; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 4407; SSE2-NEXT: por %xmm0, %xmm5 4408; SSE2-NEXT: pand %xmm5, %xmm2 4409; SSE2-NEXT: pandn %xmm8, %xmm5 4410; SSE2-NEXT: por %xmm2, %xmm5 4411; SSE2-NEXT: movdqa 
%xmm15, %xmm0 4412; SSE2-NEXT: pxor %xmm1, %xmm0 4413; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 4414; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 4415; SSE2-NEXT: movdqa %xmm6, %xmm3 4416; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 4417; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4418; SSE2-NEXT: pand %xmm2, %xmm0 4419; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] 4420; SSE2-NEXT: por %xmm0, %xmm7 4421; SSE2-NEXT: pand %xmm7, %xmm15 4422; SSE2-NEXT: pandn %xmm8, %xmm7 4423; SSE2-NEXT: por %xmm15, %xmm7 4424; SSE2-NEXT: movdqa %xmm12, %xmm0 4425; SSE2-NEXT: pxor %xmm1, %xmm0 4426; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 4427; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 4428; SSE2-NEXT: movdqa %xmm6, %xmm3 4429; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 4430; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4431; SSE2-NEXT: pand %xmm2, %xmm0 4432; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm3[1,1,3,3] 4433; SSE2-NEXT: por %xmm0, %xmm15 4434; SSE2-NEXT: pand %xmm15, %xmm12 4435; SSE2-NEXT: pandn %xmm8, %xmm15 4436; SSE2-NEXT: por %xmm12, %xmm15 4437; SSE2-NEXT: movdqa %xmm11, %xmm0 4438; SSE2-NEXT: pxor %xmm1, %xmm0 4439; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 4440; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 4441; SSE2-NEXT: movdqa %xmm6, %xmm2 4442; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 4443; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 4444; SSE2-NEXT: pand %xmm3, %xmm0 4445; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] 4446; SSE2-NEXT: por %xmm0, %xmm12 4447; SSE2-NEXT: pand %xmm12, %xmm11 4448; SSE2-NEXT: pandn %xmm8, %xmm12 4449; SSE2-NEXT: por %xmm11, %xmm12 4450; SSE2-NEXT: movdqa %xmm9, %xmm0 4451; SSE2-NEXT: pxor %xmm1, %xmm0 4452; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 4453; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 4454; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 4455; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 4456; SSE2-NEXT: pand %xmm2, %xmm0 4457; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] 4458; SSE2-NEXT: por %xmm0, %xmm2 4459; SSE2-NEXT: pand %xmm2, %xmm9 4460; SSE2-NEXT: pandn 
%xmm8, %xmm2 4461; SSE2-NEXT: por %xmm9, %xmm2 4462; SSE2-NEXT: movdqa %xmm2, %xmm0 4463; SSE2-NEXT: pxor %xmm1, %xmm0 4464; SSE2-NEXT: movdqa %xmm0, %xmm6 4465; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 4466; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] 4467; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 4468; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4469; SSE2-NEXT: pand %xmm8, %xmm0 4470; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 4471; SSE2-NEXT: por %xmm0, %xmm6 4472; SSE2-NEXT: pand %xmm2, %xmm6 4473; SSE2-NEXT: movdqa %xmm12, %xmm0 4474; SSE2-NEXT: pxor %xmm1, %xmm0 4475; SSE2-NEXT: movdqa %xmm0, %xmm2 4476; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 4477; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] 4478; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 4479; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 4480; SSE2-NEXT: pand %xmm8, %xmm3 4481; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] 4482; SSE2-NEXT: por %xmm3, %xmm0 4483; SSE2-NEXT: pand %xmm12, %xmm0 4484; SSE2-NEXT: packuswb %xmm6, %xmm0 4485; SSE2-NEXT: movdqa %xmm15, %xmm2 4486; SSE2-NEXT: pxor %xmm1, %xmm2 4487; SSE2-NEXT: movdqa %xmm2, %xmm3 4488; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 4489; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] 4490; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 4491; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 4492; SSE2-NEXT: pand %xmm6, %xmm2 4493; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 4494; SSE2-NEXT: por %xmm2, %xmm3 4495; SSE2-NEXT: pand %xmm15, %xmm3 4496; SSE2-NEXT: movdqa %xmm7, %xmm2 4497; SSE2-NEXT: pxor %xmm1, %xmm2 4498; SSE2-NEXT: movdqa %xmm2, %xmm6 4499; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 4500; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] 4501; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 4502; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 4503; SSE2-NEXT: pand %xmm8, %xmm2 4504; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 4505; SSE2-NEXT: por %xmm2, %xmm6 4506; SSE2-NEXT: pand %xmm7, %xmm6 4507; SSE2-NEXT: packuswb %xmm3, %xmm6 4508; SSE2-NEXT: packuswb %xmm6, %xmm0 4509; SSE2-NEXT: movdqa 
%xmm5, %xmm2 4510; SSE2-NEXT: pxor %xmm1, %xmm2 4511; SSE2-NEXT: movdqa %xmm2, %xmm3 4512; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 4513; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] 4514; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 4515; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 4516; SSE2-NEXT: pand %xmm6, %xmm2 4517; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 4518; SSE2-NEXT: por %xmm2, %xmm3 4519; SSE2-NEXT: pand %xmm5, %xmm3 4520; SSE2-NEXT: movdqa %xmm4, %xmm2 4521; SSE2-NEXT: pxor %xmm1, %xmm2 4522; SSE2-NEXT: movdqa %xmm2, %xmm5 4523; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 4524; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 4525; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 4526; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 4527; SSE2-NEXT: pand %xmm6, %xmm7 4528; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] 4529; SSE2-NEXT: por %xmm7, %xmm2 4530; SSE2-NEXT: pand %xmm4, %xmm2 4531; SSE2-NEXT: packuswb %xmm3, %xmm2 4532; SSE2-NEXT: movdqa %xmm14, %xmm3 4533; SSE2-NEXT: pxor %xmm1, %xmm3 4534; SSE2-NEXT: movdqa %xmm3, %xmm4 4535; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 4536; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 4537; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 4538; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 4539; SSE2-NEXT: pand %xmm5, %xmm3 4540; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 4541; SSE2-NEXT: por %xmm3, %xmm4 4542; SSE2-NEXT: pand %xmm14, %xmm4 4543; SSE2-NEXT: movdqa %xmm13, %xmm3 4544; SSE2-NEXT: pxor %xmm1, %xmm3 4545; SSE2-NEXT: movdqa %xmm3, %xmm5 4546; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 4547; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 4548; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 4549; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] 4550; SSE2-NEXT: pand %xmm6, %xmm1 4551; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] 4552; SSE2-NEXT: por %xmm1, %xmm3 4553; SSE2-NEXT: pand %xmm13, %xmm3 4554; SSE2-NEXT: packuswb %xmm4, %xmm3 4555; SSE2-NEXT: packuswb %xmm3, %xmm2 4556; SSE2-NEXT: packuswb %xmm2, %xmm0 4557; SSE2-NEXT: retq 4558; 4559; SSSE3-LABEL: 
trunc_packus_v16i64_v16i8: 4560; SSSE3: # %bb.0: 4561; SSSE3-NEXT: movdqa (%rdi), %xmm11 4562; SSSE3-NEXT: movdqa 16(%rdi), %xmm9 4563; SSSE3-NEXT: movdqa 32(%rdi), %xmm15 4564; SSSE3-NEXT: movdqa 48(%rdi), %xmm12 4565; SSSE3-NEXT: movdqa 80(%rdi), %xmm2 4566; SSSE3-NEXT: movdqa 64(%rdi), %xmm5 4567; SSSE3-NEXT: movdqa 112(%rdi), %xmm3 4568; SSSE3-NEXT: movdqa 96(%rdi), %xmm14 4569; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] 4570; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648] 4571; SSSE3-NEXT: movdqa %xmm14, %xmm7 4572; SSSE3-NEXT: pxor %xmm1, %xmm7 4573; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] 4574; SSSE3-NEXT: pxor %xmm10, %xmm10 4575; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 4576; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483903,2147483903] 4577; SSSE3-NEXT: movdqa %xmm6, %xmm4 4578; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 4579; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] 4580; SSSE3-NEXT: pand %xmm0, %xmm7 4581; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm4[1,1,3,3] 4582; SSSE3-NEXT: por %xmm7, %xmm13 4583; SSSE3-NEXT: pand %xmm13, %xmm14 4584; SSSE3-NEXT: pandn %xmm8, %xmm13 4585; SSSE3-NEXT: por %xmm14, %xmm13 4586; SSSE3-NEXT: movdqa %xmm3, %xmm0 4587; SSSE3-NEXT: pxor %xmm1, %xmm0 4588; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 4589; SSSE3-NEXT: pcmpeqd %xmm10, %xmm4 4590; SSSE3-NEXT: movdqa %xmm6, %xmm7 4591; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 4592; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 4593; SSSE3-NEXT: pand %xmm4, %xmm0 4594; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm7[1,1,3,3] 4595; SSSE3-NEXT: por %xmm0, %xmm14 4596; SSSE3-NEXT: pand %xmm14, %xmm3 4597; SSSE3-NEXT: pandn %xmm8, %xmm14 4598; SSSE3-NEXT: por %xmm3, %xmm14 4599; SSSE3-NEXT: movdqa %xmm5, %xmm0 4600; SSSE3-NEXT: pxor %xmm1, %xmm0 4601; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 4602; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 4603; SSSE3-NEXT: movdqa %xmm6, %xmm4 4604; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 4605; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 4606; 
SSSE3-NEXT: pand %xmm3, %xmm0 4607; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 4608; SSSE3-NEXT: por %xmm0, %xmm4 4609; SSSE3-NEXT: pand %xmm4, %xmm5 4610; SSSE3-NEXT: pandn %xmm8, %xmm4 4611; SSSE3-NEXT: por %xmm5, %xmm4 4612; SSSE3-NEXT: movdqa %xmm2, %xmm0 4613; SSSE3-NEXT: pxor %xmm1, %xmm0 4614; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 4615; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 4616; SSSE3-NEXT: movdqa %xmm6, %xmm5 4617; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 4618; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 4619; SSSE3-NEXT: pand %xmm3, %xmm0 4620; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 4621; SSSE3-NEXT: por %xmm0, %xmm5 4622; SSSE3-NEXT: pand %xmm5, %xmm2 4623; SSSE3-NEXT: pandn %xmm8, %xmm5 4624; SSSE3-NEXT: por %xmm2, %xmm5 4625; SSSE3-NEXT: movdqa %xmm15, %xmm0 4626; SSSE3-NEXT: pxor %xmm1, %xmm0 4627; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 4628; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 4629; SSSE3-NEXT: movdqa %xmm6, %xmm3 4630; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 4631; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4632; SSSE3-NEXT: pand %xmm2, %xmm0 4633; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] 4634; SSSE3-NEXT: por %xmm0, %xmm7 4635; SSSE3-NEXT: pand %xmm7, %xmm15 4636; SSSE3-NEXT: pandn %xmm8, %xmm7 4637; SSSE3-NEXT: por %xmm15, %xmm7 4638; SSSE3-NEXT: movdqa %xmm12, %xmm0 4639; SSSE3-NEXT: pxor %xmm1, %xmm0 4640; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 4641; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 4642; SSSE3-NEXT: movdqa %xmm6, %xmm3 4643; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 4644; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4645; SSSE3-NEXT: pand %xmm2, %xmm0 4646; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm3[1,1,3,3] 4647; SSSE3-NEXT: por %xmm0, %xmm15 4648; SSSE3-NEXT: pand %xmm15, %xmm12 4649; SSSE3-NEXT: pandn %xmm8, %xmm15 4650; SSSE3-NEXT: por %xmm12, %xmm15 4651; SSSE3-NEXT: movdqa %xmm11, %xmm0 4652; SSSE3-NEXT: pxor %xmm1, %xmm0 4653; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 4654; SSSE3-NEXT: pcmpeqd 
%xmm10, %xmm3 4655; SSSE3-NEXT: movdqa %xmm6, %xmm2 4656; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 4657; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] 4658; SSSE3-NEXT: pand %xmm3, %xmm0 4659; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,3,3] 4660; SSSE3-NEXT: por %xmm0, %xmm12 4661; SSSE3-NEXT: pand %xmm12, %xmm11 4662; SSSE3-NEXT: pandn %xmm8, %xmm12 4663; SSSE3-NEXT: por %xmm11, %xmm12 4664; SSSE3-NEXT: movdqa %xmm9, %xmm0 4665; SSSE3-NEXT: pxor %xmm1, %xmm0 4666; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 4667; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 4668; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 4669; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] 4670; SSSE3-NEXT: pand %xmm2, %xmm0 4671; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] 4672; SSSE3-NEXT: por %xmm0, %xmm2 4673; SSSE3-NEXT: pand %xmm2, %xmm9 4674; SSSE3-NEXT: pandn %xmm8, %xmm2 4675; SSSE3-NEXT: por %xmm9, %xmm2 4676; SSSE3-NEXT: movdqa %xmm2, %xmm0 4677; SSSE3-NEXT: pxor %xmm1, %xmm0 4678; SSSE3-NEXT: movdqa %xmm0, %xmm6 4679; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 4680; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] 4681; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 4682; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] 4683; SSSE3-NEXT: pand %xmm8, %xmm0 4684; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 4685; SSSE3-NEXT: por %xmm0, %xmm6 4686; SSSE3-NEXT: pand %xmm2, %xmm6 4687; SSSE3-NEXT: movdqa %xmm12, %xmm0 4688; SSSE3-NEXT: pxor %xmm1, %xmm0 4689; SSSE3-NEXT: movdqa %xmm0, %xmm2 4690; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 4691; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,2,2] 4692; SSSE3-NEXT: pcmpeqd %xmm1, %xmm0 4693; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 4694; SSSE3-NEXT: pand %xmm8, %xmm3 4695; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] 4696; SSSE3-NEXT: por %xmm3, %xmm0 4697; SSSE3-NEXT: pand %xmm12, %xmm0 4698; SSSE3-NEXT: packuswb %xmm6, %xmm0 4699; SSSE3-NEXT: movdqa %xmm15, %xmm2 4700; SSSE3-NEXT: pxor %xmm1, %xmm2 4701; SSSE3-NEXT: movdqa %xmm2, %xmm3 4702; SSSE3-NEXT: pcmpgtd %xmm1, 
%xmm3 4703; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] 4704; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 4705; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 4706; SSSE3-NEXT: pand %xmm6, %xmm2 4707; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 4708; SSSE3-NEXT: por %xmm2, %xmm3 4709; SSSE3-NEXT: pand %xmm15, %xmm3 4710; SSSE3-NEXT: movdqa %xmm7, %xmm2 4711; SSSE3-NEXT: pxor %xmm1, %xmm2 4712; SSSE3-NEXT: movdqa %xmm2, %xmm6 4713; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 4714; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,0,2,2] 4715; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 4716; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 4717; SSSE3-NEXT: pand %xmm8, %xmm2 4718; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 4719; SSSE3-NEXT: por %xmm2, %xmm6 4720; SSSE3-NEXT: pand %xmm7, %xmm6 4721; SSSE3-NEXT: packuswb %xmm3, %xmm6 4722; SSSE3-NEXT: packuswb %xmm6, %xmm0 4723; SSSE3-NEXT: movdqa %xmm5, %xmm2 4724; SSSE3-NEXT: pxor %xmm1, %xmm2 4725; SSSE3-NEXT: movdqa %xmm2, %xmm3 4726; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 4727; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] 4728; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 4729; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 4730; SSSE3-NEXT: pand %xmm6, %xmm2 4731; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 4732; SSSE3-NEXT: por %xmm2, %xmm3 4733; SSSE3-NEXT: pand %xmm5, %xmm3 4734; SSSE3-NEXT: movdqa %xmm4, %xmm2 4735; SSSE3-NEXT: pxor %xmm1, %xmm2 4736; SSSE3-NEXT: movdqa %xmm2, %xmm5 4737; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 4738; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 4739; SSSE3-NEXT: pcmpeqd %xmm1, %xmm2 4740; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 4741; SSSE3-NEXT: pand %xmm6, %xmm7 4742; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm5[1,1,3,3] 4743; SSSE3-NEXT: por %xmm7, %xmm2 4744; SSSE3-NEXT: pand %xmm4, %xmm2 4745; SSSE3-NEXT: packuswb %xmm3, %xmm2 4746; SSSE3-NEXT: movdqa %xmm14, %xmm3 4747; SSSE3-NEXT: pxor %xmm1, %xmm3 4748; SSSE3-NEXT: movdqa %xmm3, %xmm4 4749; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 4750; SSSE3-NEXT: 
pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] 4751; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 4752; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 4753; SSSE3-NEXT: pand %xmm5, %xmm3 4754; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 4755; SSSE3-NEXT: por %xmm3, %xmm4 4756; SSSE3-NEXT: pand %xmm14, %xmm4 4757; SSSE3-NEXT: movdqa %xmm13, %xmm3 4758; SSSE3-NEXT: pxor %xmm1, %xmm3 4759; SSSE3-NEXT: movdqa %xmm3, %xmm5 4760; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 4761; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] 4762; SSSE3-NEXT: pcmpeqd %xmm1, %xmm3 4763; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] 4764; SSSE3-NEXT: pand %xmm6, %xmm1 4765; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] 4766; SSSE3-NEXT: por %xmm1, %xmm3 4767; SSSE3-NEXT: pand %xmm13, %xmm3 4768; SSSE3-NEXT: packuswb %xmm4, %xmm3 4769; SSSE3-NEXT: packuswb %xmm3, %xmm2 4770; SSSE3-NEXT: packuswb %xmm2, %xmm0 4771; SSSE3-NEXT: retq 4772; 4773; SSE41-LABEL: trunc_packus_v16i64_v16i8: 4774; SSE41: # %bb.0: 4775; SSE41-NEXT: movdqa (%rdi), %xmm10 4776; SSE41-NEXT: movdqa 16(%rdi), %xmm9 4777; SSE41-NEXT: movdqa 32(%rdi), %xmm14 4778; SSE41-NEXT: movdqa 48(%rdi), %xmm12 4779; SSE41-NEXT: movdqa 80(%rdi), %xmm15 4780; SSE41-NEXT: movdqa 64(%rdi), %xmm6 4781; SSE41-NEXT: movdqa 112(%rdi), %xmm13 4782; SSE41-NEXT: movdqa 96(%rdi), %xmm4 4783; SSE41-NEXT: movapd {{.*#+}} xmm1 = [255,255] 4784; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] 4785; SSE41-NEXT: movdqa %xmm4, %xmm0 4786; SSE41-NEXT: pxor %xmm2, %xmm0 4787; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [2147483903,2147483903] 4788; SSE41-NEXT: movdqa %xmm7, %xmm3 4789; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 4790; SSE41-NEXT: movdqa %xmm7, %xmm5 4791; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 4792; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 4793; SSE41-NEXT: pand %xmm3, %xmm0 4794; SSE41-NEXT: por %xmm5, %xmm0 4795; SSE41-NEXT: movapd %xmm1, %xmm8 4796; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm8 4797; SSE41-NEXT: movdqa %xmm13, %xmm0 4798; SSE41-NEXT: pxor %xmm2, 
%xmm0 4799; SSE41-NEXT: movdqa %xmm7, %xmm3 4800; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 4801; SSE41-NEXT: movdqa %xmm7, %xmm4 4802; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 4803; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 4804; SSE41-NEXT: pand %xmm3, %xmm0 4805; SSE41-NEXT: por %xmm4, %xmm0 4806; SSE41-NEXT: movapd %xmm1, %xmm11 4807; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm11 4808; SSE41-NEXT: movdqa %xmm6, %xmm0 4809; SSE41-NEXT: pxor %xmm2, %xmm0 4810; SSE41-NEXT: movdqa %xmm7, %xmm3 4811; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 4812; SSE41-NEXT: movdqa %xmm7, %xmm4 4813; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 4814; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 4815; SSE41-NEXT: pand %xmm3, %xmm0 4816; SSE41-NEXT: por %xmm4, %xmm0 4817; SSE41-NEXT: movapd %xmm1, %xmm13 4818; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm13 4819; SSE41-NEXT: movdqa %xmm15, %xmm0 4820; SSE41-NEXT: pxor %xmm2, %xmm0 4821; SSE41-NEXT: movdqa %xmm7, %xmm3 4822; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 4823; SSE41-NEXT: movdqa %xmm7, %xmm4 4824; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 4825; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 4826; SSE41-NEXT: pand %xmm3, %xmm0 4827; SSE41-NEXT: por %xmm4, %xmm0 4828; SSE41-NEXT: movapd %xmm1, %xmm6 4829; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm6 4830; SSE41-NEXT: movdqa %xmm14, %xmm0 4831; SSE41-NEXT: pxor %xmm2, %xmm0 4832; SSE41-NEXT: movdqa %xmm7, %xmm3 4833; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 4834; SSE41-NEXT: movdqa %xmm7, %xmm4 4835; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 4836; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 4837; SSE41-NEXT: pand %xmm3, %xmm0 4838; SSE41-NEXT: por %xmm4, %xmm0 4839; SSE41-NEXT: movapd %xmm1, %xmm15 4840; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm15 4841; SSE41-NEXT: movdqa %xmm12, %xmm0 4842; SSE41-NEXT: pxor %xmm2, %xmm0 4843; SSE41-NEXT: movdqa %xmm7, %xmm4 4844; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 4845; SSE41-NEXT: movdqa %xmm7, %xmm5 4846; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 4847; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 4848; 
SSE41-NEXT: pand %xmm4, %xmm0 4849; SSE41-NEXT: por %xmm5, %xmm0 4850; SSE41-NEXT: movapd %xmm1, %xmm4 4851; SSE41-NEXT: blendvpd %xmm0, %xmm12, %xmm4 4852; SSE41-NEXT: movdqa %xmm10, %xmm0 4853; SSE41-NEXT: pxor %xmm2, %xmm0 4854; SSE41-NEXT: movdqa %xmm7, %xmm5 4855; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 4856; SSE41-NEXT: movdqa %xmm7, %xmm3 4857; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 4858; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4859; SSE41-NEXT: pand %xmm5, %xmm0 4860; SSE41-NEXT: por %xmm3, %xmm0 4861; SSE41-NEXT: movapd %xmm1, %xmm5 4862; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm5 4863; SSE41-NEXT: movdqa %xmm9, %xmm0 4864; SSE41-NEXT: pxor %xmm2, %xmm0 4865; SSE41-NEXT: movdqa %xmm7, %xmm3 4866; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 4867; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 4868; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] 4869; SSE41-NEXT: pand %xmm3, %xmm0 4870; SSE41-NEXT: por %xmm7, %xmm0 4871; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 4872; SSE41-NEXT: xorpd %xmm9, %xmm9 4873; SSE41-NEXT: movapd %xmm1, %xmm3 4874; SSE41-NEXT: xorpd %xmm2, %xmm3 4875; SSE41-NEXT: movapd %xmm3, %xmm7 4876; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 4877; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 4878; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4879; SSE41-NEXT: pand %xmm7, %xmm0 4880; SSE41-NEXT: por %xmm3, %xmm0 4881; SSE41-NEXT: pxor %xmm3, %xmm3 4882; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 4883; SSE41-NEXT: movapd %xmm5, %xmm1 4884; SSE41-NEXT: xorpd %xmm2, %xmm1 4885; SSE41-NEXT: movapd %xmm1, %xmm7 4886; SSE41-NEXT: pcmpeqd %xmm2, %xmm7 4887; SSE41-NEXT: pcmpgtd %xmm2, %xmm1 4888; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] 4889; SSE41-NEXT: pand %xmm7, %xmm0 4890; SSE41-NEXT: por %xmm1, %xmm0 4891; SSE41-NEXT: pxor %xmm1, %xmm1 4892; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 4893; SSE41-NEXT: packusdw %xmm3, %xmm1 4894; SSE41-NEXT: movapd %xmm4, %xmm3 4895; SSE41-NEXT: xorpd %xmm2, %xmm3 4896; SSE41-NEXT: movapd %xmm3, %xmm5 4897; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 4898; 
SSE41-NEXT: pcmpgtd %xmm2, %xmm3 4899; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4900; SSE41-NEXT: pand %xmm5, %xmm0 4901; SSE41-NEXT: por %xmm3, %xmm0 4902; SSE41-NEXT: pxor %xmm3, %xmm3 4903; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm3 4904; SSE41-NEXT: movapd %xmm15, %xmm4 4905; SSE41-NEXT: xorpd %xmm2, %xmm4 4906; SSE41-NEXT: movapd %xmm4, %xmm5 4907; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 4908; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 4909; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 4910; SSE41-NEXT: pand %xmm5, %xmm0 4911; SSE41-NEXT: por %xmm4, %xmm0 4912; SSE41-NEXT: pxor %xmm4, %xmm4 4913; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4 4914; SSE41-NEXT: packusdw %xmm3, %xmm4 4915; SSE41-NEXT: packusdw %xmm4, %xmm1 4916; SSE41-NEXT: movapd %xmm6, %xmm3 4917; SSE41-NEXT: xorpd %xmm2, %xmm3 4918; SSE41-NEXT: movapd %xmm3, %xmm4 4919; SSE41-NEXT: pcmpeqd %xmm2, %xmm4 4920; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 4921; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4922; SSE41-NEXT: pand %xmm4, %xmm0 4923; SSE41-NEXT: por %xmm3, %xmm0 4924; SSE41-NEXT: pxor %xmm4, %xmm4 4925; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 4926; SSE41-NEXT: movapd %xmm13, %xmm3 4927; SSE41-NEXT: xorpd %xmm2, %xmm3 4928; SSE41-NEXT: movapd %xmm3, %xmm5 4929; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 4930; SSE41-NEXT: pcmpgtd %xmm2, %xmm3 4931; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] 4932; SSE41-NEXT: pand %xmm5, %xmm0 4933; SSE41-NEXT: por %xmm3, %xmm0 4934; SSE41-NEXT: pxor %xmm3, %xmm3 4935; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm3 4936; SSE41-NEXT: packusdw %xmm4, %xmm3 4937; SSE41-NEXT: movapd %xmm11, %xmm4 4938; SSE41-NEXT: xorpd %xmm2, %xmm4 4939; SSE41-NEXT: movapd %xmm4, %xmm5 4940; SSE41-NEXT: pcmpeqd %xmm2, %xmm5 4941; SSE41-NEXT: pcmpgtd %xmm2, %xmm4 4942; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] 4943; SSE41-NEXT: pand %xmm5, %xmm0 4944; SSE41-NEXT: por %xmm4, %xmm0 4945; SSE41-NEXT: pxor %xmm4, %xmm4 4946; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4 4947; SSE41-NEXT: movapd %xmm8, 
%xmm5 4948; SSE41-NEXT: xorpd %xmm2, %xmm5 4949; SSE41-NEXT: movapd %xmm5, %xmm6 4950; SSE41-NEXT: pcmpeqd %xmm2, %xmm6 4951; SSE41-NEXT: pcmpgtd %xmm2, %xmm5 4952; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] 4953; SSE41-NEXT: pand %xmm6, %xmm0 4954; SSE41-NEXT: por %xmm5, %xmm0 4955; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 4956; SSE41-NEXT: packusdw %xmm4, %xmm9 4957; SSE41-NEXT: packusdw %xmm9, %xmm3 4958; SSE41-NEXT: packuswb %xmm3, %xmm1 4959; SSE41-NEXT: movdqa %xmm1, %xmm0 4960; SSE41-NEXT: retq 4961; 4962; AVX1-LABEL: trunc_packus_v16i64_v16i8: 4963; AVX1: # %bb.0: 4964; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0 4965; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255] 4966; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1 4967; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm8 4968; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1 4969; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 4970; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm9 4971; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3 4972; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4 4973; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3 4974; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4 4975; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5 4976; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm4 4977; AVX1-NEXT: vmovdqa (%rdi), %xmm5 4978; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6 4979; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7 4980; AVX1-NEXT: vmovdqa 48(%rdi), %xmm0 4981; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm1 4982; AVX1-NEXT: vblendvpd %xmm1, %xmm7, %xmm2, %xmm1 4983; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm7 4984; AVX1-NEXT: vblendvpd %xmm7, %xmm0, %xmm2, %xmm0 4985; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 4986; AVX1-NEXT: vblendvpd %xmm7, %xmm5, %xmm2, %xmm5 4987; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7 4988; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm2, %xmm2 4989; AVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 4990; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm7 4991; AVX1-NEXT: vpand %xmm2, %xmm7, %xmm2 4992; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm7 4993; AVX1-NEXT: vpand %xmm5, %xmm7, %xmm5 4994; 
AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 4995; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm5 4996; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 4997; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm5 4998; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 4999; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 5000; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 5001; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm1 5002; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 5003; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm2 5004; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 5005; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 5006; AVX1-NEXT: vpcmpgtq %xmm6, %xmm9, %xmm2 5007; AVX1-NEXT: vpand %xmm2, %xmm9, %xmm2 5008; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm3 5009; AVX1-NEXT: vpand %xmm3, %xmm8, %xmm3 5010; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 5011; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 5012; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 5013; AVX1-NEXT: retq 5014; 5015; AVX2-LABEL: trunc_packus_v16i64_v16i8: 5016; AVX2: # %bb.0: 5017; AVX2-NEXT: vmovdqa (%rdi), %ymm0 5018; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 5019; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 5020; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 5021; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 5022; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 5023; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 5024; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5 5025; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 5026; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 5027; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 5028; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 5029; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 5030; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 5031; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 5032; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1 5033; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 5034; AVX2-NEXT: vpand %ymm0, %ymm5, %ymm0 5035; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 5036; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1 5037; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 5038; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3 5039; AVX2-NEXT: 
vpand %ymm2, %ymm3, %ymm2 5040; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 5041; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] 5042; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 5043; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 5044; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 5045; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 5046; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 5047; AVX2-NEXT: vzeroupper 5048; AVX2-NEXT: retq 5049; 5050; AVX512-LABEL: trunc_packus_v16i64_v16i8: 5051; AVX512: # %bb.0: 5052; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 5053; AVX512-NEXT: vpmaxsq 64(%rdi), %zmm0, %zmm1 5054; AVX512-NEXT: vpmovusqb %zmm1, %xmm1 5055; AVX512-NEXT: vpmaxsq (%rdi), %zmm0, %zmm0 5056; AVX512-NEXT: vpmovusqb %zmm0, %xmm0 5057; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5058; AVX512-NEXT: vzeroupper 5059; AVX512-NEXT: retq 5060; 5061; SKX-LABEL: trunc_packus_v16i64_v16i8: 5062; SKX: # %bb.0: 5063; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 5064; SKX-NEXT: vpmaxsq 96(%rdi), %ymm0, %ymm1 5065; SKX-NEXT: vpmovusqb %ymm1, %xmm1 5066; SKX-NEXT: vpmaxsq 64(%rdi), %ymm0, %ymm2 5067; SKX-NEXT: vpmovusqb %ymm2, %xmm2 5068; SKX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 5069; SKX-NEXT: vpmaxsq 32(%rdi), %ymm0, %ymm2 5070; SKX-NEXT: vpmovusqb %ymm2, %xmm2 5071; SKX-NEXT: vpmaxsq (%rdi), %ymm0, %ymm0 5072; SKX-NEXT: vpmovusqb %ymm0, %xmm0 5073; SKX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 5074; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 5075; SKX-NEXT: vzeroupper 5076; SKX-NEXT: retq 5077 %a0 = load <16 x i64>, ptr %p0 5078 %1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 5079 %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255> 5080 %3 = icmp sgt 
<16 x i64> %2, zeroinitializer
  %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer
  %5 = trunc <16 x i64> %4 to <16 x i8>
  ret <16 x i8> %5
}

; Clamps each signed i32 lane of %a0 to the range [0, 255] using two
; icmp/select pairs (upper bound first, then lower bound) and truncates the
; clamped lanes to i8. The function carries "min-legal-vector-width"="256".
define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"="256" {
; SSE2-LABEL: trunc_packus_v4i32_v4i8:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: packuswb %xmm0, %xmm0
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i32_v4i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm2, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: movdqa %xmm0, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i32_v4i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pmaxsd %xmm1, %xmm0
; SSE41-NEXT: packusdw %xmm0, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v4i32_v4i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v4i32_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v4i32_v4i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v4i32_v4i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovusdb %xmm0, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v4i32_v4i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v4i32_v4i8:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpmovusdb %xmm0, %xmm0
; SKX-NEXT: retq
  %lt255 = icmp slt <4 x i32> %a0, <i32 255, i32 255, i32 255, i32 255>
  %min255 = select <4 x i1> %lt255, <4 x i32> %a0, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
  %gt0 = icmp sgt <4 x i32> %min255, zeroinitializer
  %max0 = select <4 x i1> %gt0, <4 x i32> %min255, <4 x i32> zeroinitializer
  %narrow = trunc <4 x i32> %max0 to <4 x i8>
  ret <4 x i8> %narrow
}

; Same [0, 255] clamp and i32 -> i8 truncation as above, but the <4 x i8>
; result is stored through %p1 and the function returns void.
define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
; SSE2-LABEL: trunc_packus_v4i32_v4i8_store:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: por %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
; SSE2-NEXT: pand %xmm2, %xmm1
; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
; SSE2-NEXT: packuswb %xmm1, %xmm1
; SSE2-NEXT: movd %xmm1, (%rdi)
; SSE2-NEXT: retq
;
; SSSE3-LABEL: trunc_packus_v4i32_v4i8_store:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
; SSSE3-NEXT: movdqa %xmm1, %xmm2
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
; SSSE3-NEXT: pand %xmm2, %xmm0
; SSSE3-NEXT: pandn %xmm1, %xmm2
; SSSE3-NEXT: por %xmm0, %xmm2
; SSSE3-NEXT: pxor %xmm0, %xmm0
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
; SSSE3-NEXT: pand %xmm2, %xmm1
; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movd %xmm1, (%rdi)
; SSSE3-NEXT: retq
;
; SSE41-LABEL: trunc_packus_v4i32_v4i8_store:
; SSE41: # %bb.0:
; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pmaxsd %xmm0, %xmm1
; SSE41-NEXT: packusdw %xmm1, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm1
; SSE41-NEXT: movd %xmm1, (%rdi)
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovd %xmm0, (%rdi)
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpmovusdb %xmm0, (%rdi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0
; AVX512BW-NEXT: vmovd %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8_store:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovusdb %xmm0, (%rdi)
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v4i32_v4i8_store:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpmovusdb %xmm0, (%rdi)
; SKX-NEXT: retq
  %lt255 = icmp slt <4 x i32> %a0, <i32 255, i32 255, i32 255, i32 255>
  %min255 = select <4 x i1> %lt255, <4 x i32> %a0, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
  %gt0 = icmp sgt <4 x i32> %min255, zeroinitializer
  %max0 = select <4 x i1> %gt0, <4 x i32> %min255, <4 x i32> zeroinitializer
  %narrow = trunc <4 x i32> %max0 to <4 x i8>
  store <4 x i8> %narrow, ptr %p1
  ret void
}

; Clamps each signed i32 lane of the <8 x i32> argument to [0, 255] and
; truncates the clamped lanes to i8, returning <8 x i8>.
define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
; SSE-LABEL: trunc_packus_v8i32_v8i8:
; SSE: # %bb.0:
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v8i32_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v8i32_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovusdb %ymm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v8i32_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovusdb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v8i32_v8i8:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; SKX-NEXT: vpmovusdb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %lt255 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %min255 = select <8 x i1> %lt255, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %gt0 = icmp sgt <8 x i32> %min255, zeroinitializer
  %max0 = select <8 x i1> %gt0, <8 x i32> %min255, <8 x i32> zeroinitializer
  %narrow = trunc <8 x i32> %max0 to <8 x i8>
  ret <8 x i8> %narrow
}

; Store variant of the <8 x i32> -> <8 x i8> clamp-and-truncate: the result
; is written through %p1 and the function returns void.
define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, ptr%p1) {
; SSE-LABEL: trunc_packus_v8i32_v8i8_store:
; SSE: # %bb.0:
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: movq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v8i32_v8i8_store:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
; SKX-NEXT: vpmovusdb %ymm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %lt255 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %min255 = select <8 x i1> %lt255, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %gt0 = icmp sgt <8 x i32> %min255, zeroinitializer
  %max0 = select <8 x i1> %gt0, <8 x i32> %min255, <8 x i32> zeroinitializer
  %narrow = trunc <8 x i32> %max0 to <8 x i8>
  store <8 x i8> %narrow, ptr %p1
  ret void
}

; Loads <16 x i32> from %p0, clamps each signed lane to [0, 255] and
; truncates to <16 x i8>. The function carries
; "min-legal-vector-width"="256"; the skx expectations use ymm operations
; while the other AVX-512 runs use zmm.
define <16 x i8> @trunc_packus_v16i32_v16i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE-LABEL: trunc_packus_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: packssdw 48(%rdi), %xmm1
; SSE-NEXT: packssdw 16(%rdi), %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm0
; AVX512-NEXT: vpmovusdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v16i32_v16i8:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm0
; SKX-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT: vpmovuswb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %wide = load <16 x i32>, ptr %p0
  %lt255 = icmp slt <16 x i32> %wide, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %min255 = select <16 x i1> %lt255, <16 x i32> %wide, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %gt0 = icmp sgt <16 x i32> %min255, zeroinitializer
  %max0 = select <16 x i1> %gt0, <16 x i32> %min255, <16 x i32> zeroinitializer
  %narrow = trunc <16 x i32> %max0 to <16 x i8>
  ret <16 x i8> %narrow
}

; Loads <16 x i32> from %p0, clamps each signed lane to [0, 255] with two
; icmp/select pairs, truncates to <16 x i8> and stores the result through
; %p1. The function carries "min-legal-vector-width"="256".
define void @trunc_packus_v16i32_v16i8_store(ptr %p0, ptr %p1) "min-legal-vector-width"="256" {
; SSE-LABEL: trunc_packus_v16i32_v16i8_store:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: packssdw 48(%rdi), %xmm1
; SSE-NEXT: packssdw 16(%rdi), %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: movdqa %xmm0, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i32_v16i8_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i32_v16i8_store:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v16i32_v16i8_store:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm0
; AVX512-NEXT: vpmovusdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v16i32_v16i8_store:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm0
; SKX-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0
; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT: vpmovuswb %ymm0, (%rsi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %wide = load <16 x i32>, ptr %p0
  %lt255 = icmp slt <16 x i32> %wide, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %min255 = select <16 x i1> %lt255, <16 x i32> %wide, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %gt0 = icmp sgt <16 x i32> %min255, zeroinitializer
  %max0 = select <16 x i1> %gt0, <16 x i32> %min255, <16 x i32> zeroinitializer
  %narrow = trunc <16 x i32> %max0 to <16 x i8>
  store <16 x i8> %narrow, ptr %p1
  ret void
}

; Clamps each signed i16 lane of %a0 to [0, 255] and truncates the clamped
; lanes to i8, returning <8 x i8>.
define <8 x i8> @trunc_packus_v8i16_v8i8(<8 x i16> %a0) {
; SSE-LABEL: trunc_packus_v8i16_v8i8:
; SSE: # %bb.0:
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_packus_v8i16_v8i8:
; AVX: # %bb.0:
; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v8i16_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v8i16_v8i8:
; SKX: # %bb.0:
; SKX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; SKX-NEXT: retq
  %lt255 = icmp slt <8 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %min255 = select <8 x i1> %lt255, <8 x i16> %a0, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %gt0 = icmp sgt <8 x i16> %min255, zeroinitializer
  %max0 = select <8 x i1> %gt0, <8 x i16> %min255, <8 x i16> zeroinitializer
  %narrow = trunc <8 x i16> %max0 to <8 x i8>
  ret <8 x i8> %narrow
}

; Store variant of the <8 x i16> -> <8 x i8> clamp-and-truncate: the result
; is written through %p1 and the function returns void.
define void @trunc_packus_v8i16_v8i8_store(<8 x i16> %a0, ptr%p1) {
; SSE-LABEL: trunc_packus_v8i16_v8i8_store:
; SSE: # %bb.0:
; SSE-NEXT: packuswb %xmm0, %xmm0
; SSE-NEXT: movq %xmm0, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX: # %bb.0:
; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, (%rdi)
; AVX-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovq %xmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT: vmovq %xmm0, (%rdi)
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v8i16_v8i8_store:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmovuswb %xmm0, (%rdi)
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v8i16_v8i8_store:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
; SKX-NEXT: vpmovuswb %xmm0, (%rdi)
; SKX-NEXT: retq
  %lt255 = icmp slt <8 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %min255 = select <8 x i1> %lt255, <8 x i16> %a0, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %gt0 = icmp sgt <8 x i16> %min255, zeroinitializer
  %max0 = select <8 x i1> %gt0, <8 x i16> %min255, <8 x i16> zeroinitializer
  %narrow = trunc <8 x i16> %max0 to <8 x i8>
  store <8 x i8> %narrow, ptr %p1
  ret void
}

; Clamps each signed i16 lane of the <16 x i16> argument to [0, 255] and
; truncates the clamped lanes to i8, returning <16 x i8>.
define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
; SSE-LABEL: trunc_packus_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v16i16_v16i8:
; SKX: # %bb.0:
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
; SKX-NEXT: vpmovuswb %ymm0, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
  %lt255 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %min255 = select <16 x i1> %lt255, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %gt0 = icmp sgt <16 x i16> %min255, zeroinitializer
  %max0 = select <16 x i1> %gt0, <16 x i16> %min255, <16 x i16> zeroinitializer
  %narrow = trunc <16 x i16> %max0 to <16 x i8>
  ret <16 x i8> %narrow
}

; Loads <32 x i16> from %p0, clamps each signed lane to [0, 255] and
; truncates to <32 x i8>. The function carries
; "min-legal-vector-width"="256".
define <32 x i8> @trunc_packus_v32i16_v32i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE-LABEL: trunc_packus_v32i16_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm1
; SSE-NEXT: packuswb 16(%rdi), %xmm0
; SSE-NEXT: packuswb 48(%rdi), %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v32i16_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT: vpackuswb 48(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vpackuswb 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v32i16_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vpackuswb 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vpackuswb 32(%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vpackuswb 32(%rdi), %ymm0, %ymm0
; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpmaxsw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpmaxsw (%rdi), %zmm0, %zmm0
; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0
; AVX512BWVL-NEXT: retq
;
; SKX-LABEL: trunc_packus_v32i16_v32i8:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm0
; SKX-NEXT: vpackuswb 32(%rdi), %ymm0, %ymm0
; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT: retq
  %wide = load <32 x i16>, ptr %p0
  %lt255 = icmp slt <32 x i16> %wide, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %min255 = select <32 x i1> %lt255, <32 x i16> %wide, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %gt0 = icmp sgt <32 x i16> %min255, zeroinitializer
  %max0 = select <32 x i1> %gt0, <32 x i16> %min255, <32 x i16> zeroinitializer
  %narrow = trunc <32 x i16> %max0 to <32 x i8>
  ret <32 x i8> %narrow
}

; Loads <32 x i32> from %p0, clamps each signed lane to [0, 255] and
; truncates to <32 x i8>. The function carries
; "min-legal-vector-width"="256".
define <32 x i8> @trunc_packus_v32i32_v32i8(ptr %p0) "min-legal-vector-width"="256" {
; SSE-LABEL: trunc_packus_v32i32_v32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: movdqa 64(%rdi), %xmm1
; SSE-NEXT: movdqa 96(%rdi), %xmm3
; SSE-NEXT: packssdw 48(%rdi), %xmm2
; SSE-NEXT: packssdw 16(%rdi), %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packssdw 112(%rdi), %xmm3
; SSE-NEXT: packssdw 80(%rdi), %xmm1
; SSE-NEXT: packuswb %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_packus_v32i32_v32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm3
; AVX1-NEXT: vpackssdw 112(%rdi), %xmm3, %xmm3
; AVX1-NEXT: vpackssdw 80(%rdi), %xmm2, %xmm2
; AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1
; AVX1-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_packus_v32i32_v32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1
; AVX2-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_packus_v32i32_v32i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpmaxsd (%rdi), %zmm0, %zmm1
; AVX512-NEXT: vpmovusdb %zmm1, %xmm1
; AVX512-NEXT: vpmaxsd 64(%rdi), %zmm0, %zmm0
; AVX512-NEXT: vpmovusdb %zmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512-NEXT: retq
;
; SKX-LABEL: trunc_packus_v32i32_v32i8:
; SKX: # %bb.0:
; SKX-NEXT: vmovdqa (%rdi), %ymm0
; SKX-NEXT: vmovdqa 64(%rdi), %ymm1
; SKX-NEXT: vpackssdw 96(%rdi), %ymm1, %ymm1
; SKX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; SKX-NEXT: vpackssdw 32(%rdi), %ymm0, %ymm0
; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; SKX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; SKX-NEXT: retq
  %wide = load <32 x i32>, ptr %p0
  %lt255 = icmp slt <32 x i32> %wide, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %min255 = select <32 x i1> %lt255, <32 x i32> %wide, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
  %gt0 = icmp sgt <32 x i32> %min255, zeroinitializer
  %max0 = select <32 x i1> %gt0, <32 x i32> %min255, <32 x i32> zeroinitializer
  %narrow = trunc <32 x i32> %max0 to <32 x i8>
  ret <32 x i8> %narrow
}