1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ 11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ 12 13; 14; add 15; 16 17define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 18; SSE-LABEL: trunc_add_v4i64_v4i32: 19; SSE: # %bb.0: 20; SSE-NEXT: paddq %xmm3, %xmm1 21; SSE-NEXT: paddq %xmm2, %xmm0 22; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 23; SSE-NEXT: retq 24; 25; AVX1-LABEL: trunc_add_v4i64_v4i32: 26; AVX1: # %bb.0: 27; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 28; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 29; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 30; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 31; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 32; AVX1-NEXT: vzeroupper 33; AVX1-NEXT: retq 34; 35; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32: 36; AVX2-SLOW: # %bb.0: 37; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 38; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 39; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 40; AVX2-SLOW-NEXT: vzeroupper 41; AVX2-SLOW-NEXT: retq 42; 43; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32: 44; AVX2-FAST-ALL: # %bb.0: 45; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0 46; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 47; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 48; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 49; AVX2-FAST-ALL-NEXT: vzeroupper 50; AVX2-FAST-ALL-NEXT: retq 51; 52; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32: 53; AVX2-FAST-PERLANE: # %bb.0: 54; AVX2-FAST-PERLANE-NEXT: vpaddq %ymm1, %ymm0, %ymm0 55; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 56; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 57; AVX2-FAST-PERLANE-NEXT: vzeroupper 58; AVX2-FAST-PERLANE-NEXT: retq 59; 60; AVX512-LABEL: trunc_add_v4i64_v4i32: 61; AVX512: # %bb.0: 62; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 63; AVX512-NEXT: vpmovqd %zmm0, %ymm0 64; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 65; AVX512-NEXT: vzeroupper 66; AVX512-NEXT: retq 67 %1 = add <4 x i64> %a0, %a1 68 %2 = trunc <4 x i64> %1 to <4 x i32> 69 ret <4 x i32> %2 70} 71 72define <8 x i16> 
@trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 73; SSE-LABEL: trunc_add_v8i64_v8i16: 74; SSE: # %bb.0: 75; SSE-NEXT: paddq %xmm6, %xmm2 76; SSE-NEXT: paddq %xmm7, %xmm3 77; SSE-NEXT: paddq %xmm4, %xmm0 78; SSE-NEXT: paddq %xmm5, %xmm1 79; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 80; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 81; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 82; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 83; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 84; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 85; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 86; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 87; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 88; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 89; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 90; SSE-NEXT: retq 91; 92; AVX1-LABEL: trunc_add_v8i64_v8i16: 93; AVX1: # %bb.0: 94; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 95; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 96; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 97; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 98; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2 99; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 100; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 101; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 102; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 103; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 104; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 105; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 106; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 107; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] 108; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 109; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 110; AVX1-NEXT: vzeroupper 111; AVX1-NEXT: retq 112; 113; AVX2-LABEL: trunc_add_v8i64_v8i16: 114; AVX2: # %bb.0: 115; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 116; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 117; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 118; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 119; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 120; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 121; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 122; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 123; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 124; AVX2-NEXT: vzeroupper 125; AVX2-NEXT: retq 126; 127; AVX512-LABEL: trunc_add_v8i64_v8i16: 128; AVX512: # %bb.0: 129; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 130; AVX512-NEXT: vpmovqw %zmm0, %xmm0 131; AVX512-NEXT: vzeroupper 132; AVX512-NEXT: retq 133 %1 = add <8 x i64> %a0, %a1 134 %2 = trunc <8 x i64> %1 to <8 x i16> 135 ret <8 x i16> %2 136} 137 138define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 139; SSE-LABEL: trunc_add_v8i32_v8i16: 140; SSE: # %bb.0: 141; SSE-NEXT: paddd %xmm2, %xmm0 142; SSE-NEXT: paddd %xmm3, %xmm1 143; SSE-NEXT: pslld $16, %xmm1 144; SSE-NEXT: psrad $16, %xmm1 145; SSE-NEXT: pslld $16, %xmm0 146; SSE-NEXT: psrad $16, %xmm0 147; SSE-NEXT: packssdw %xmm1, %xmm0 148; SSE-NEXT: retq 149; 150; AVX1-LABEL: trunc_add_v8i32_v8i16: 151; AVX1: # %bb.0: 152; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 153; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 154; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 155; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 156; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 157; 
AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 158; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 159; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 160; AVX1-NEXT: vzeroupper 161; AVX1-NEXT: retq 162; 163; AVX2-LABEL: trunc_add_v8i32_v8i16: 164; AVX2: # %bb.0: 165; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 166; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 167; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 168; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 169; AVX2-NEXT: vzeroupper 170; AVX2-NEXT: retq 171; 172; AVX512-LABEL: trunc_add_v8i32_v8i16: 173; AVX512: # %bb.0: 174; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 175; AVX512-NEXT: vpmovdw %zmm0, %ymm0 176; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 177; AVX512-NEXT: vzeroupper 178; AVX512-NEXT: retq 179 %1 = add <8 x i32> %a0, %a1 180 %2 = trunc <8 x i32> %1 to <8 x i16> 181 ret <8 x i16> %2 182} 183 184define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 185; SSE-LABEL: trunc_add_v16i64_v16i8: 186; SSE: # %bb.0: 187; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0 188; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1 189; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2 190; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3 191; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4 192; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5 193; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6 194; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7 195; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 196; SSE-NEXT: pand %xmm8, %xmm7 197; SSE-NEXT: pand %xmm8, %xmm6 198; SSE-NEXT: packuswb %xmm7, %xmm6 199; SSE-NEXT: pand %xmm8, %xmm5 200; SSE-NEXT: pand %xmm8, %xmm4 201; SSE-NEXT: packuswb %xmm5, %xmm4 202; SSE-NEXT: packuswb %xmm6, %xmm4 203; SSE-NEXT: pand %xmm8, %xmm3 204; SSE-NEXT: pand %xmm8, %xmm2 205; SSE-NEXT: packuswb %xmm3, %xmm2 206; SSE-NEXT: pand %xmm8, %xmm1 207; SSE-NEXT: pand %xmm8, %xmm0 208; SSE-NEXT: packuswb %xmm1, %xmm0 209; SSE-NEXT: packuswb %xmm2, %xmm0 210; SSE-NEXT: packuswb %xmm4, %xmm0 211; SSE-NEXT: retq 212; 213; AVX1-LABEL: trunc_add_v16i64_v16i8: 214; AVX1: # %bb.0: 215; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8 216; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 217; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 218; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 219; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4 220; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 221; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 222; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 223; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5 224; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 225; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 226; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2 227; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6 228; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 229; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 230; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3 231; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] 232; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 233; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 234; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 235; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 236; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 237; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 238; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 239; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 240; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 241; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 242; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 243; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 244; AVX1-NEXT: 
vpackusdw %xmm0, %xmm3, %xmm0 245; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 246; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 247; AVX1-NEXT: vzeroupper 248; AVX1-NEXT: retq 249; 250; AVX2-LABEL: trunc_add_v16i64_v16i8: 251; AVX2: # %bb.0: 252; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0 253; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1 254; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2 255; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3 256; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 257; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 258; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 259; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 260; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 261; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 262; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 263; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 264; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 265; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 266; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 267; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 268; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 269; AVX2-NEXT: vzeroupper 270; AVX2-NEXT: retq 271; 272; AVX512-LABEL: trunc_add_v16i64_v16i8: 273; AVX512: # %bb.0: 274; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 275; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1 276; AVX512-NEXT: vpmovqb %zmm1, %xmm1 277; AVX512-NEXT: vpmovqb %zmm0, %xmm0 278; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 279; AVX512-NEXT: vzeroupper 280; AVX512-NEXT: retq 281 %1 = add <16 x i64> %a0, %a1 282 %2 = trunc <16 x i64> %1 to <16 x i8> 283 ret <16 x i8> %2 284} 285 286define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 287; SSE-LABEL: trunc_add_v16i32_v16i8: 288; SSE: # %bb.0: 289; SSE-NEXT: paddd %xmm4, %xmm0 290; SSE-NEXT: paddd %xmm5, %xmm1 291; SSE-NEXT: paddd %xmm6, %xmm2 292; SSE-NEXT: paddd %xmm7, %xmm3 293; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 294; SSE-NEXT: pand %xmm4, %xmm3 295; SSE-NEXT: pand %xmm4, %xmm2 296; SSE-NEXT: packuswb %xmm3, %xmm2 297; SSE-NEXT: pand %xmm4, %xmm1 298; SSE-NEXT: pand %xmm4, %xmm0 299; SSE-NEXT: packuswb %xmm1, %xmm0 300; SSE-NEXT: packuswb %xmm2, %xmm0 301; SSE-NEXT: retq 302; 303; AVX1-LABEL: trunc_add_v16i32_v16i8: 304; AVX1: # %bb.0: 305; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 306; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 307; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 308; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 309; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2 310; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 311; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 312; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 313; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] 314; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 315; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 316; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 317; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 318; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 319; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 320; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 321; AVX1-NEXT: vzeroupper 322; AVX1-NEXT: retq 323; 324; AVX2-LABEL: trunc_add_v16i32_v16i8: 325; AVX2: # %bb.0: 326; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 327; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 328; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 329; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 330; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 331; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 332; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 333; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 334; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 335; AVX2-NEXT: vzeroupper 336; AVX2-NEXT: retq 337; 
338; AVX512-LABEL: trunc_add_v16i32_v16i8: 339; AVX512: # %bb.0: 340; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 341; AVX512-NEXT: vpmovdb %zmm0, %xmm0 342; AVX512-NEXT: vzeroupper 343; AVX512-NEXT: retq 344 %1 = add <16 x i32> %a0, %a1 345 %2 = trunc <16 x i32> %1 to <16 x i8> 346 ret <16 x i8> %2 347} 348 349define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 350; SSE-LABEL: trunc_add_v16i16_v16i8: 351; SSE: # %bb.0: 352; SSE-NEXT: paddw %xmm2, %xmm0 353; SSE-NEXT: paddw %xmm3, %xmm1 354; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 355; SSE-NEXT: pand %xmm2, %xmm1 356; SSE-NEXT: pand %xmm2, %xmm0 357; SSE-NEXT: packuswb %xmm1, %xmm0 358; SSE-NEXT: retq 359; 360; AVX1-LABEL: trunc_add_v16i16_v16i8: 361; AVX1: # %bb.0: 362; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 363; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 364; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 365; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 366; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 367; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 368; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 369; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 370; AVX1-NEXT: vzeroupper 371; AVX1-NEXT: retq 372; 373; AVX2-LABEL: trunc_add_v16i16_v16i8: 374; AVX2: # %bb.0: 375; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 376; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 377; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 378; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 379; AVX2-NEXT: vzeroupper 380; AVX2-NEXT: retq 381; 382; AVX512F-LABEL: trunc_add_v16i16_v16i8: 383; AVX512F: # %bb.0: 384; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 385; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 386; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 387; AVX512F-NEXT: vzeroupper 388; AVX512F-NEXT: retq 389; 390; AVX512BW-LABEL: trunc_add_v16i16_v16i8: 391; AVX512BW: # %bb.0: 392; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 393; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 394; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 395; AVX512BW-NEXT: vzeroupper 396; AVX512BW-NEXT: retq 397; 398; AVX512DQ-LABEL: trunc_add_v16i16_v16i8: 399; AVX512DQ: # %bb.0: 400; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 401; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 402; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 403; AVX512DQ-NEXT: vzeroupper 404; AVX512DQ-NEXT: retq 405 %1 = add <16 x i16> %a0, %a1 406 %2 = trunc <16 x i16> %1 to <16 x i8> 407 ret <16 x i8> %2 408} 409 410define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) { 411; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8: 412; SSE: # %bb.0: 413; SSE-NEXT: pslld $16, %xmm2 414; SSE-NEXT: psrad $16, %xmm2 415; SSE-NEXT: pslld $16, %xmm1 416; SSE-NEXT: psrad $16, %xmm1 417; SSE-NEXT: packssdw %xmm2, %xmm1 418; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 419; SSE-NEXT: psraw $8, %xmm0 420; SSE-NEXT: paddw %xmm1, %xmm0 421; SSE-NEXT: retq 422; 423; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8: 424; AVX1: # %bb.0: 425; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 426; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 427; 
AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 428; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 429; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 430; AVX1-NEXT: vzeroupper 431; AVX1-NEXT: retq 432; 433; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8: 434; AVX2: # %bb.0: 435; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 436; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 437; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 438; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 439; AVX2-NEXT: vzeroupper 440; AVX2-NEXT: retq 441; 442; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8: 443; AVX512: # %bb.0: 444; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 445; AVX512-NEXT: vpmovdw %zmm1, %ymm1 446; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 447; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 448; AVX512-NEXT: vzeroupper 449; AVX512-NEXT: retq 450 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 451 %2 = sext <8 x i8> %1 to <8 x i32> 452 %3 = add <8 x i32> %2, %a1 453 %4 = trunc <8 x i32> %3 to <8 x i16> 454 ret <8 x i16> %4 455} 456 457; 458; add to constant 459; 460 461define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 462; SSE-LABEL: trunc_add_const_v4i64_v4i32: 463; SSE: # %bb.0: 464; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 465; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 466; SSE-NEXT: retq 467; 468; AVX1-LABEL: trunc_add_const_v4i64_v4i32: 469; AVX1: # %bb.0: 470; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 471; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 472; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 473; AVX1-NEXT: vzeroupper 474; AVX1-NEXT: retq 475; 476; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32: 477; AVX2-SLOW: # %bb.0: 478; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 479; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 480; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 481; AVX2-SLOW-NEXT: vzeroupper 482; AVX2-SLOW-NEXT: retq 483; 484; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32: 485; AVX2-FAST-ALL: # %bb.0: 486; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 487; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 488; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 489; AVX2-FAST-ALL-NEXT: vzeroupper 490; AVX2-FAST-ALL-NEXT: retq 491; 492; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32: 493; AVX2-FAST-PERLANE: # %bb.0: 494; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 495; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 496; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 497; AVX2-FAST-PERLANE-NEXT: vzeroupper 498; AVX2-FAST-PERLANE-NEXT: retq 499; 500; AVX512-LABEL: trunc_add_const_v4i64_v4i32: 501; AVX512: # %bb.0: 502; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 503; AVX512-NEXT: vpmovqd %zmm0, %ymm0 504; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 505; AVX512-NEXT: vzeroupper 506; AVX512-NEXT: retq 507 %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 508 %2 = trunc <4 x i64> %1 to <4 x i32> 509 ret <4 x i32> %2 510} 511 512define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 513; SSE-LABEL: trunc_add_const_v8i64_v8i16: 514; SSE: # %bb.0: 515; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 516; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 517; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 518; SSE-NEXT: pshuflw {{.*#+}} 
xmm4 = xmm0[0,2,2,3,4,5,6,7] 519; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 520; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 521; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 522; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 523; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 524; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 525; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 526; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 527; SSE-NEXT: retq 528; 529; AVX1-LABEL: trunc_add_const_v8i64_v8i16: 530; AVX1: # %bb.0: 531; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 532; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 533; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 534; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 535; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 536; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 537; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 538; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 539; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 540; AVX1-NEXT: vzeroupper 541; AVX1-NEXT: retq 542; 543; AVX2-LABEL: trunc_add_const_v8i64_v8i16: 544; AVX2: # %bb.0: 545; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 546; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 547; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 548; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 549; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 550; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 551; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 552; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 553; AVX2-NEXT: vzeroupper 554; AVX2-NEXT: retq 555; 556; AVX512-LABEL: trunc_add_const_v8i64_v8i16: 557; AVX512: # %bb.0: 558; AVX512-NEXT: vpmovqw %zmm0, %xmm0 559; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 560; AVX512-NEXT: vzeroupper 561; AVX512-NEXT: retq 562 %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 563 %2 = trunc <8 x i64> %1 to <8 x i16> 564 ret <8 x i16> %2 565} 566 567define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 568; SSE-LABEL: trunc_add_const_v8i32_v8i16: 569; SSE: # %bb.0: 570; SSE-NEXT: pslld $16, %xmm1 571; SSE-NEXT: psrad $16, %xmm1 572; SSE-NEXT: pslld $16, %xmm0 573; SSE-NEXT: psrad $16, %xmm0 574; SSE-NEXT: packssdw %xmm1, %xmm0 575; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 576; SSE-NEXT: retq 577; 578; AVX1-LABEL: trunc_add_const_v8i32_v8i16: 579; AVX1: # %bb.0: 580; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 581; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 582; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 583; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 584; AVX1-NEXT: vzeroupper 585; AVX1-NEXT: retq 586; 587; AVX2-LABEL: trunc_add_const_v8i32_v8i16: 588; AVX2: # %bb.0: 589; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 590; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 591; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 592; AVX2-NEXT: vzeroupper 593; AVX2-NEXT: retq 594; 595; AVX512-LABEL: trunc_add_const_v8i32_v8i16: 596; AVX512: # %bb.0: 597; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 598; AVX512-NEXT: vpmovdw %zmm0, %ymm0 599; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 600; AVX512-NEXT: vzeroupper 601; AVX512-NEXT: retq 
602 %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 603 %2 = trunc <8 x i32> %1 to <8 x i16> 604 ret <8 x i16> %2 605} 606 607define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 608; SSE-LABEL: trunc_add_const_v16i64_v16i8: 609; SSE: # %bb.0: 610; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 611; SSE-NEXT: pand %xmm8, %xmm7 612; SSE-NEXT: pand %xmm8, %xmm6 613; SSE-NEXT: packuswb %xmm7, %xmm6 614; SSE-NEXT: pand %xmm8, %xmm5 615; SSE-NEXT: pand %xmm8, %xmm4 616; SSE-NEXT: packuswb %xmm5, %xmm4 617; SSE-NEXT: packuswb %xmm6, %xmm4 618; SSE-NEXT: pand %xmm8, %xmm3 619; SSE-NEXT: pand %xmm8, %xmm2 620; SSE-NEXT: packuswb %xmm3, %xmm2 621; SSE-NEXT: pand %xmm8, %xmm1 622; SSE-NEXT: pand %xmm8, %xmm0 623; SSE-NEXT: packuswb %xmm1, %xmm0 624; SSE-NEXT: packuswb %xmm2, %xmm0 625; SSE-NEXT: packuswb %xmm4, %xmm0 626; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 627; SSE-NEXT: retq 628; 629; AVX1-LABEL: trunc_add_const_v16i64_v16i8: 630; AVX1: # %bb.0: 631; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 632; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 633; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 634; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 635; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 636; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 637; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 638; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 639; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 640; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 641; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 642; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 643; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 644; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 645; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 646; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 647; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 648; AVX1-NEXT: vzeroupper 649; AVX1-NEXT: retq 650; 651; AVX2-LABEL: trunc_add_const_v16i64_v16i8: 652; AVX2: # %bb.0: 653; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 654; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 655; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 656; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 657; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 658; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 659; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 660; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 661; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 662; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 663; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 664; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 665; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 666; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 667; AVX2-NEXT: vzeroupper 668; AVX2-NEXT: retq 669; 670; AVX512-LABEL: trunc_add_const_v16i64_v16i8: 671; AVX512: # %bb.0: 672; AVX512-NEXT: vpmovqb %zmm1, %xmm1 673; AVX512-NEXT: vpmovqb %zmm0, %xmm0 674; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 675; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 676; AVX512-NEXT: vzeroupper 677; AVX512-NEXT: retq 678 %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 679 %2 = trunc <16 x i64> %1 to <16 x i8> 680 ret <16 x i8> %2 681} 682 683define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 684; SSE-LABEL: trunc_add_const_v16i32_v16i8: 685; SSE: # %bb.0: 686; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 687; SSE-NEXT: pand %xmm4, %xmm3 688; SSE-NEXT: pand %xmm4, %xmm2 
689; SSE-NEXT: packuswb %xmm3, %xmm2 690; SSE-NEXT: pand %xmm4, %xmm1 691; SSE-NEXT: pand %xmm4, %xmm0 692; SSE-NEXT: packuswb %xmm1, %xmm0 693; SSE-NEXT: packuswb %xmm2, %xmm0 694; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 695; SSE-NEXT: retq 696; 697; AVX1-LABEL: trunc_add_const_v16i32_v16i8: 698; AVX1: # %bb.0: 699; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 700; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 701; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 702; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 703; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 704; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 705; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 706; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 707; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 708; AVX1-NEXT: vzeroupper 709; AVX1-NEXT: retq 710; 711; AVX2-LABEL: trunc_add_const_v16i32_v16i8: 712; AVX2: # %bb.0: 713; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 714; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 715; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 716; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 717; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 718; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 719; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 720; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 721; AVX2-NEXT: vzeroupper 722; AVX2-NEXT: retq 723; 724; AVX512-LABEL: trunc_add_const_v16i32_v16i8: 725; AVX512: # %bb.0: 726; AVX512-NEXT: vpmovdb %zmm0, %xmm0 727; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 728; AVX512-NEXT: vzeroupper 729; AVX512-NEXT: retq 730 %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 731 %2 = trunc <16 x i32> %1 to <16 x i8> 732 ret <16 x i8> %2 733} 734 735define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 736; SSE-LABEL: trunc_add_const_v16i16_v16i8: 737; SSE: # %bb.0: 738; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 739; SSE-NEXT: pand %xmm2, %xmm1 740; SSE-NEXT: pand %xmm2, %xmm0 741; SSE-NEXT: packuswb %xmm1, %xmm0 742; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 743; SSE-NEXT: retq 744; 745; AVX1-LABEL: trunc_add_const_v16i16_v16i8: 746; AVX1: # %bb.0: 747; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 748; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 749; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 750; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 751; AVX1-NEXT: vzeroupper 752; AVX1-NEXT: retq 753; 754; AVX2-LABEL: trunc_add_const_v16i16_v16i8: 755; AVX2: # %bb.0: 756; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 757; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 758; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 759; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 760; AVX2-NEXT: vzeroupper 761; AVX2-NEXT: retq 762; 763; AVX512F-LABEL: trunc_add_const_v16i16_v16i8: 764; AVX512F: # %bb.0: 765; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 766; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 767; AVX512F-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 768; AVX512F-NEXT: vzeroupper 769; AVX512F-NEXT: retq 770; 771; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: 772; AVX512BW: # %bb.0: 773; AVX512BW-NEXT: # kill: def 
$ymm0 killed $ymm0 def $zmm0 774; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 775; AVX512BW-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 776; AVX512BW-NEXT: vzeroupper 777; AVX512BW-NEXT: retq 778; 779; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: 780; AVX512DQ: # %bb.0: 781; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 782; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 783; AVX512DQ-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 784; AVX512DQ-NEXT: vzeroupper 785; AVX512DQ-NEXT: retq 786 %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 787 %2 = trunc <16 x i16> %1 to <16 x i8> 788 ret <16 x i8> %2 789} 790 791; 792; sub 793; 794 795define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 796; SSE-LABEL: trunc_sub_v4i64_v4i32: 797; SSE: # %bb.0: 798; SSE-NEXT: psubq %xmm3, %xmm1 799; SSE-NEXT: psubq %xmm2, %xmm0 800; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 801; SSE-NEXT: retq 802; 803; AVX1-LABEL: trunc_sub_v4i64_v4i32: 804; AVX1: # %bb.0: 805; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 806; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 807; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 808; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 809; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 810; AVX1-NEXT: vzeroupper 811; AVX1-NEXT: retq 812; 813; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32: 814; AVX2-SLOW: # %bb.0: 815; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 816; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 817; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 818; AVX2-SLOW-NEXT: vzeroupper 819; AVX2-SLOW-NEXT: retq 820; 821; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32: 822; AVX2-FAST-ALL: # %bb.0: 823; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0 824; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 825; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 826; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 827; AVX2-FAST-ALL-NEXT: vzeroupper 828; AVX2-FAST-ALL-NEXT: retq 829; 830; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32: 831; AVX2-FAST-PERLANE: # %bb.0: 832; AVX2-FAST-PERLANE-NEXT: vpsubq %ymm1, %ymm0, %ymm0 833; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 834; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 835; AVX2-FAST-PERLANE-NEXT: vzeroupper 836; AVX2-FAST-PERLANE-NEXT: retq 837; 838; AVX512-LABEL: trunc_sub_v4i64_v4i32: 839; AVX512: # %bb.0: 840; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 841; AVX512-NEXT: vpmovqd %zmm0, %ymm0 842; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 843; AVX512-NEXT: vzeroupper 844; AVX512-NEXT: retq 845 %1 = sub <4 x i64> %a0, %a1 846 %2 = trunc <4 x i64> %1 to <4 x i32> 847 ret <4 x i32> %2 848} 849 850define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 851; SSE-LABEL: trunc_sub_v8i64_v8i16: 852; SSE: # %bb.0: 853; SSE-NEXT: psubq %xmm6, %xmm2 854; SSE-NEXT: psubq %xmm7, %xmm3 855; SSE-NEXT: psubq %xmm4, %xmm0 856; SSE-NEXT: psubq %xmm5, %xmm1 857; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 858; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 859; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 860; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 861; SSE-NEXT: punpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1] 862; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 863; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 864; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 865; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 866; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 867; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 868; SSE-NEXT: retq 869; 870; AVX1-LABEL: trunc_sub_v8i64_v8i16: 871; AVX1: # %bb.0: 872; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4 873; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 874; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 875; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 876; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2 877; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 878; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 879; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 880; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 881; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] 882; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] 883; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 884; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] 885; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] 886; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 887; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 888; AVX1-NEXT: vzeroupper 889; AVX1-NEXT: retq 890; 891; AVX2-LABEL: trunc_sub_v8i64_v8i16: 892; AVX2: # %bb.0: 893; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 894; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 895; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 896; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 897; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 898; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 899; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 900; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 901; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 902; AVX2-NEXT: vzeroupper 903; AVX2-NEXT: retq 904; 905; AVX512-LABEL: trunc_sub_v8i64_v8i16: 906; AVX512: # %bb.0: 907; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 908; AVX512-NEXT: vpmovqw %zmm0, %xmm0 909; AVX512-NEXT: vzeroupper 910; AVX512-NEXT: retq 911 %1 = sub <8 x i64> %a0, %a1 912 %2 = trunc <8 x i64> %1 to <8 x i16> 913 ret <8 x i16> %2 914} 915 916define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 917; SSE-LABEL: trunc_sub_v8i32_v8i16: 918; SSE: # %bb.0: 919; SSE-NEXT: psubd %xmm2, %xmm0 920; SSE-NEXT: psubd %xmm3, %xmm1 921; SSE-NEXT: pslld $16, %xmm1 922; SSE-NEXT: psrad $16, %xmm1 923; SSE-NEXT: pslld $16, %xmm0 924; SSE-NEXT: psrad $16, %xmm0 925; SSE-NEXT: packssdw %xmm1, %xmm0 926; SSE-NEXT: retq 927; 928; AVX1-LABEL: trunc_sub_v8i32_v8i16: 929; AVX1: # %bb.0: 930; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 931; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 932; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 933; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 934; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 935; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 936; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 937; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 938; AVX1-NEXT: vzeroupper 939; AVX1-NEXT: retq 940; 941; AVX2-LABEL: trunc_sub_v8i32_v8i16: 942; AVX2: # %bb.0: 943; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 944; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 945; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 946; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 947; AVX2-NEXT: vzeroupper 948; AVX2-NEXT: retq 949; 950; AVX512-LABEL: trunc_sub_v8i32_v8i16: 951; AVX512: # %bb.0: 952; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 953; AVX512-NEXT: vpmovdw %zmm0, %ymm0 954; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 955; AVX512-NEXT: vzeroupper 956; AVX512-NEXT: retq 957 %1 = sub <8 x i32> %a0, %a1 958 %2 = trunc <8 x i32> %1 to <8 x i16> 959 ret <8 x i16> %2 960} 961 962define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 963; SSE-LABEL: trunc_sub_v16i64_v16i8: 964; SSE: # %bb.0: 965; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0 966; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1 967; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2 968; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3 969; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4 970; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5 971; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6 972; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7 973; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 974; SSE-NEXT: pand %xmm8, %xmm7 975; SSE-NEXT: pand %xmm8, %xmm6 976; SSE-NEXT: packuswb %xmm7, %xmm6 977; SSE-NEXT: pand %xmm8, %xmm5 978; SSE-NEXT: pand %xmm8, %xmm4 979; SSE-NEXT: packuswb %xmm5, %xmm4 980; SSE-NEXT: packuswb %xmm6, %xmm4 981; SSE-NEXT: pand %xmm8, %xmm3 982; SSE-NEXT: pand %xmm8, %xmm2 983; SSE-NEXT: packuswb %xmm3, %xmm2 984; SSE-NEXT: pand %xmm8, %xmm1 985; SSE-NEXT: pand %xmm8, %xmm0 986; SSE-NEXT: packuswb %xmm1, %xmm0 987; SSE-NEXT: packuswb %xmm2, %xmm0 988; SSE-NEXT: packuswb %xmm4, %xmm0 989; SSE-NEXT: retq 990; 991; AVX1-LABEL: trunc_sub_v16i64_v16i8: 992; AVX1: # %bb.0: 993; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8 994; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 995; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 996; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0 997; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4 998; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 999; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1000; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1 1001; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5 1002; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 1003; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1004; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 1005; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6 1006; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 1007; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1008; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3 1009; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] 1010; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 1011; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 1012; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 1013; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 1014; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 1015; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 1016; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1017; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 1018; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 1019; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 1020; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 1021; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 1022; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 1023; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1024; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1025; AVX1-NEXT: vzeroupper 1026; AVX1-NEXT: retq 1027; 1028; AVX2-LABEL: trunc_sub_v16i64_v16i8: 1029; AVX2: # %bb.0: 1030; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0 1031; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1 1032; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2 1033; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3 1034; 
AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 1035; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 1036; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 1037; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1038; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 1039; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 1040; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 1041; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1042; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1043; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 1044; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1045; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1046; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1047; AVX2-NEXT: vzeroupper 1048; AVX2-NEXT: retq 1049; 1050; AVX512-LABEL: trunc_sub_v16i64_v16i8: 1051; AVX512: # %bb.0: 1052; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0 1053; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1 1054; AVX512-NEXT: vpmovqb %zmm1, %xmm1 1055; AVX512-NEXT: vpmovqb %zmm0, %xmm0 1056; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1057; AVX512-NEXT: vzeroupper 1058; AVX512-NEXT: retq 1059 %1 = sub <16 x i64> %a0, %a1 1060 %2 = trunc <16 x i64> %1 to <16 x i8> 1061 ret <16 x i8> %2 1062} 1063 1064define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 1065; SSE-LABEL: trunc_sub_v16i32_v16i8: 1066; SSE: # %bb.0: 1067; SSE-NEXT: psubd %xmm4, %xmm0 1068; SSE-NEXT: psubd %xmm5, %xmm1 1069; SSE-NEXT: psubd %xmm6, %xmm2 1070; SSE-NEXT: psubd %xmm7, %xmm3 1071; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1072; SSE-NEXT: pand %xmm4, %xmm3 1073; SSE-NEXT: pand %xmm4, %xmm2 1074; SSE-NEXT: packuswb %xmm3, %xmm2 1075; SSE-NEXT: pand %xmm4, %xmm1 1076; SSE-NEXT: pand %xmm4, %xmm0 1077; SSE-NEXT: packuswb %xmm1, %xmm0 1078; SSE-NEXT: packuswb %xmm2, %xmm0 1079; SSE-NEXT: retq 1080; 1081; AVX1-LABEL: trunc_sub_v16i32_v16i8: 1082; AVX1: # %bb.0: 1083; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4 1084; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1085; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1086; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 1087; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2 1088; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1089; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1090; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 1091; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] 1092; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1093; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1094; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 1095; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1096; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 1097; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 1098; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1099; AVX1-NEXT: vzeroupper 1100; AVX1-NEXT: retq 1101; 1102; AVX2-LABEL: trunc_sub_v16i32_v16i8: 1103; AVX2: # %bb.0: 1104; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 1105; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 1106; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 1107; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1108; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1109; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1110; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1111; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1112; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1113; AVX2-NEXT: vzeroupper 1114; AVX2-NEXT: retq 1115; 1116; AVX512-LABEL: trunc_sub_v16i32_v16i8: 1117; AVX512: # %bb.0: 1118; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 1119; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1120; AVX512-NEXT: vzeroupper 1121; AVX512-NEXT: retq 1122 %1 = sub <16 x i32> %a0, %a1 1123 %2 = trunc <16 x i32> %1 to <16 x i8> 1124 ret <16 x i8> %2 1125} 1126 1127define <16 x 
i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 1128; SSE-LABEL: trunc_sub_v16i16_v16i8: 1129; SSE: # %bb.0: 1130; SSE-NEXT: psubw %xmm2, %xmm0 1131; SSE-NEXT: psubw %xmm3, %xmm1 1132; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1133; SSE-NEXT: pand %xmm2, %xmm1 1134; SSE-NEXT: pand %xmm2, %xmm0 1135; SSE-NEXT: packuswb %xmm1, %xmm0 1136; SSE-NEXT: retq 1137; 1138; AVX1-LABEL: trunc_sub_v16i16_v16i8: 1139; AVX1: # %bb.0: 1140; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 1141; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1142; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1143; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 1144; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 1145; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 1146; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 1147; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 1148; AVX1-NEXT: vzeroupper 1149; AVX1-NEXT: retq 1150; 1151; AVX2-LABEL: trunc_sub_v16i16_v16i8: 1152; AVX2: # %bb.0: 1153; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 1154; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1155; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1156; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1157; AVX2-NEXT: vzeroupper 1158; AVX2-NEXT: retq 1159; 1160; AVX512F-LABEL: trunc_sub_v16i16_v16i8: 1161; AVX512F: # %bb.0: 1162; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0 1163; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1164; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1165; AVX512F-NEXT: vzeroupper 1166; AVX512F-NEXT: retq 1167; 1168; AVX512BW-LABEL: trunc_sub_v16i16_v16i8: 1169; AVX512BW: # %bb.0: 1170; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 1171; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1172; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1173; AVX512BW-NEXT: vzeroupper 1174; AVX512BW-NEXT: retq 1175; 1176; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8: 1177; AVX512DQ: # %bb.0: 1178; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0 1179; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1180; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1181; AVX512DQ-NEXT: vzeroupper 1182; AVX512DQ-NEXT: retq 1183 %1 = sub <16 x i16> %a0, %a1 1184 %2 = trunc <16 x i16> %1 to <16 x i8> 1185 ret <16 x i8> %2 1186} 1187 1188define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) { 1189; SSE-LABEL: trunc_ext_sub_v16i16_v16i8: 1190; SSE: # %bb.0: 1191; SSE-NEXT: psubb %xmm1, %xmm0 1192; SSE-NEXT: retq 1193; 1194; AVX-LABEL: trunc_ext_sub_v16i16_v16i8: 1195; AVX: # %bb.0: 1196; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 1197; AVX-NEXT: retq 1198 %a = zext <16 x i8> %x to <16 x i16> 1199 %b = zext <16 x i8> %y to <16 x i16> 1200 %c = sub <16 x i16> %a, %b 1201 %d = trunc <16 x i16> %c to <16 x i8> 1202 ret <16 x i8> %d 1203} 1204 1205; 1206; sub to constant 1207; 1208 1209define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 1210; SSE-LABEL: trunc_sub_const_v4i64_v4i32: 1211; SSE: # %bb.0: 1212; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1213; SSE-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1214; SSE-NEXT: retq 1215; 1216; AVX1-LABEL: trunc_sub_const_v4i64_v4i32: 1217; AVX1: # %bb.0: 1218; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1219; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1220; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1221; AVX1-NEXT: vzeroupper 1222; AVX1-NEXT: retq 1223; 1224; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32: 1225; AVX2-SLOW: # %bb.0: 1226; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 1227; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1228; AVX2-SLOW-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1229; AVX2-SLOW-NEXT: vzeroupper 1230; AVX2-SLOW-NEXT: retq 1231; 1232; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32: 1233; AVX2-FAST-ALL: # %bb.0: 1234; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 1235; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 1236; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1237; AVX2-FAST-ALL-NEXT: vzeroupper 1238; AVX2-FAST-ALL-NEXT: retq 1239; 1240; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32: 1241; AVX2-FAST-PERLANE: # %bb.0: 1242; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 1243; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1244; AVX2-FAST-PERLANE-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1245; AVX2-FAST-PERLANE-NEXT: vzeroupper 1246; AVX2-FAST-PERLANE-NEXT: retq 1247; 1248; AVX512-LABEL: trunc_sub_const_v4i64_v4i32: 1249; AVX512: # %bb.0: 1250; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1251; AVX512-NEXT: vpmovqd %zmm0, %ymm0 1252; AVX512-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1253; AVX512-NEXT: vzeroupper 1254; AVX512-NEXT: retq 1255 %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 1256 %2 = trunc <4 x i64> %1 to <4 x i32> 1257 ret <4 x i32> %2 1258} 1259 1260define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 1261; SSE-LABEL: trunc_sub_const_v8i64_v8i16: 1262; SSE: # %bb.0: 1263; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1264; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1265; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1266; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 1267; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1268; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 1269; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 1270; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1271; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1272; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1273; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 1274; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1275; SSE-NEXT: retq 1276; 1277; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: 1278; AVX1: # %bb.0: 1279; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 1280; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 1281; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1282; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1283; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 1284; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1285; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 1286; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1287; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1288; AVX1-NEXT: vzeroupper 1289; AVX1-NEXT: retq 1290; 1291; AVX2-LABEL: trunc_sub_const_v8i64_v8i16: 1292; AVX2: # %bb.0: 1293; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1294; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 1295; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 1296; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1297; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1298; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1299; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1300; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1301; AVX2-NEXT: vzeroupper 1302; AVX2-NEXT: retq 1303; 1304; AVX512-LABEL: trunc_sub_const_v8i64_v8i16: 1305; AVX512: # %bb.0: 1306; AVX512-NEXT: vpmovqw %zmm0, %xmm0 1307; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1308; AVX512-NEXT: vzeroupper 1309; AVX512-NEXT: retq 1310 %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 1311 %2 = trunc <8 x i64> %1 to <8 x i16> 1312 ret <8 x i16> %2 1313} 1314 1315define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 1316; SSE-LABEL: trunc_sub_const_v8i32_v8i16: 1317; SSE: # %bb.0: 1318; SSE-NEXT: pslld $16, %xmm1 1319; SSE-NEXT: psrad $16, %xmm1 1320; SSE-NEXT: pslld $16, %xmm0 1321; SSE-NEXT: psrad $16, %xmm0 1322; SSE-NEXT: packssdw %xmm1, %xmm0 1323; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1324; SSE-NEXT: retq 1325; 1326; AVX1-LABEL: trunc_sub_const_v8i32_v8i16: 1327; AVX1: # %bb.0: 1328; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1329; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1330; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1331; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1332; AVX1-NEXT: vzeroupper 1333; AVX1-NEXT: retq 1334; 1335; AVX2-LABEL: trunc_sub_const_v8i32_v8i16: 1336; AVX2: # %bb.0: 1337; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 1338; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1339; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1340; AVX2-NEXT: vzeroupper 1341; AVX2-NEXT: retq 1342; 1343; AVX512-LABEL: trunc_sub_const_v8i32_v8i16: 1344; AVX512: # %bb.0: 1345; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1346; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1347; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1348; AVX512-NEXT: vzeroupper 1349; AVX512-NEXT: retq 1350 %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1351 %2 = trunc <8 x i32> %1 to <8 x i16> 1352 ret <8 x i16> %2 1353} 1354 1355define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 1356; SSE-LABEL: trunc_sub_const_v16i64_v16i8: 1357; SSE: # %bb.0: 1358; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1359; SSE-NEXT: pand %xmm8, %xmm7 1360; SSE-NEXT: pand %xmm8, %xmm6 1361; SSE-NEXT: packuswb %xmm7, %xmm6 1362; SSE-NEXT: pand %xmm8, %xmm5 1363; SSE-NEXT: pand %xmm8, %xmm4 1364; SSE-NEXT: packuswb %xmm5, %xmm4 1365; SSE-NEXT: packuswb %xmm6, %xmm4 1366; SSE-NEXT: pand %xmm8, %xmm3 1367; SSE-NEXT: pand %xmm8, %xmm2 1368; SSE-NEXT: packuswb %xmm3, %xmm2 1369; SSE-NEXT: pand %xmm8, %xmm1 1370; SSE-NEXT: pand %xmm8, %xmm0 1371; SSE-NEXT: packuswb %xmm1, %xmm0 1372; SSE-NEXT: packuswb %xmm2, %xmm0 1373; SSE-NEXT: packuswb %xmm4, %xmm0 1374; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1375; SSE-NEXT: retq 1376; 1377; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: 1378; AVX1: # %bb.0: 1379; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 1380; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 1381; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 1382; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 1383; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 1384; 
AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 1385; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 1386; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1387; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 1388; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1389; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1390; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 1391; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1392; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 1393; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1394; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1395; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1396; AVX1-NEXT: vzeroupper 1397; AVX1-NEXT: retq 1398; 1399; AVX2-LABEL: trunc_sub_const_v16i64_v16i8: 1400; AVX2: # %bb.0: 1401; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 1402; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 1403; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 1404; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1405; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 1406; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 1407; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 1408; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1409; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1410; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 1411; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1412; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1413; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1414; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1415; AVX2-NEXT: vzeroupper 1416; AVX2-NEXT: retq 1417; 1418; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: 1419; AVX512: # %bb.0: 1420; AVX512-NEXT: vpmovqb %zmm1, %xmm1 1421; AVX512-NEXT: vpmovqb %zmm0, %xmm0 1422; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1423; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1424; AVX512-NEXT: vzeroupper 1425; AVX512-NEXT: retq 1426 %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 1427 %2 = trunc <16 x i64> %1 to <16 x i8> 1428 ret <16 x i8> %2 1429} 1430 1431define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 1432; SSE-LABEL: trunc_sub_const_v16i32_v16i8: 1433; SSE: # %bb.0: 1434; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1435; SSE-NEXT: pand %xmm4, %xmm3 1436; SSE-NEXT: pand %xmm4, %xmm2 1437; SSE-NEXT: packuswb %xmm3, %xmm2 1438; SSE-NEXT: pand %xmm4, %xmm1 1439; SSE-NEXT: pand %xmm4, %xmm0 1440; SSE-NEXT: packuswb %xmm1, %xmm0 1441; SSE-NEXT: packuswb %xmm2, %xmm0 1442; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1443; SSE-NEXT: retq 1444; 1445; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: 1446; AVX1: # %bb.0: 1447; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 1448; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 1449; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1450; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1451; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 1452; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1453; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 1454; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1455; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1456; AVX1-NEXT: vzeroupper 1457; AVX1-NEXT: retq 1458; 1459; AVX2-LABEL: trunc_sub_const_v16i32_v16i8: 1460; AVX2: # %bb.0: 1461; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 1462; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 1463; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 1464; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1465; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1466; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 
1467; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1468; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1469; AVX2-NEXT: vzeroupper 1470; AVX2-NEXT: retq 1471; 1472; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: 1473; AVX512: # %bb.0: 1474; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1475; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1476; AVX512-NEXT: vzeroupper 1477; AVX512-NEXT: retq 1478 %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1479 %2 = trunc <16 x i32> %1 to <16 x i8> 1480 ret <16 x i8> %2 1481} 1482 1483define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 1484; SSE-LABEL: trunc_sub_const_v16i16_v16i8: 1485; SSE: # %bb.0: 1486; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1487; SSE-NEXT: pand %xmm2, %xmm1 1488; SSE-NEXT: pand %xmm2, %xmm0 1489; SSE-NEXT: packuswb %xmm1, %xmm0 1490; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1491; SSE-NEXT: retq 1492; 1493; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: 1494; AVX1: # %bb.0: 1495; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1496; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1497; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1498; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1499; AVX1-NEXT: vzeroupper 1500; AVX1-NEXT: retq 1501; 1502; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: 1503; AVX2: # %bb.0: 1504; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1505; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1506; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1507; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1508; AVX2-NEXT: vzeroupper 1509; AVX2-NEXT: retq 1510; 1511; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: 1512; AVX512F: # %bb.0: 1513; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1514; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1515; AVX512F-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1516; AVX512F-NEXT: vzeroupper 1517; AVX512F-NEXT: retq 1518; 1519; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: 1520; AVX512BW: # %bb.0: 1521; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1522; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1523; AVX512BW-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1524; AVX512BW-NEXT: vzeroupper 1525; AVX512BW-NEXT: retq 1526; 1527; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: 1528; AVX512DQ: # %bb.0: 1529; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1530; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1531; AVX512DQ-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1532; AVX512DQ-NEXT: vzeroupper 1533; AVX512DQ-NEXT: retq 1534 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1535 %2 = trunc <16 x i16> %1 to <16 x i8> 1536 ret <16 x i8> %2 1537} 1538 1539define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) { 1540; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: 1541; SSE: # %bb.0: 1542; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm0 1543; SSE-NEXT: retq 1544; 1545; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: 1546; AVX: # %bb.0: 1547; AVX-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1548; AVX-NEXT: retq 1549 %a = zext <16 x i8> %x to <16 x i16> 1550 %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1551 %c = trunc <16 x i16> %b to <16 x i8> 1552 ret <16 x i8> %c 1553} 1554 1555define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) { 1556; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: 1557; SSE: # %bb.0: 1558; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1559; SSE-NEXT: psubb %xmm0, %xmm1 1560; SSE-NEXT: movdqa %xmm1, %xmm0 1561; SSE-NEXT: retq 1562; 1563; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: 1564; AVX: # %bb.0: 1565; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1566; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0 1567; AVX-NEXT: retq 1568 %a = zext <16 x i8> %x to <16 x i16> 1569 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a 1570 %c = trunc <16 x i16> %b to <16 x i8> 1571 ret <16 x i8> %c 1572} 1573 1574; 1575; mul 1576; 1577 1578define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 1579; SSE-LABEL: trunc_mul_v4i64_v4i32: 1580; SSE: # %bb.0: 1581; SSE-NEXT: pmuludq %xmm3, %xmm1 1582; SSE-NEXT: pmuludq %xmm2, %xmm0 1583; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1584; SSE-NEXT: retq 1585; 1586; AVX1-LABEL: trunc_mul_v4i64_v4i32: 1587; AVX1: # %bb.0: 1588; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1589; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1590; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1591; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1592; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1593; AVX1-NEXT: vzeroupper 1594; AVX1-NEXT: retq 1595; 1596; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: 1597; AVX2-SLOW: # %bb.0: 1598; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 1599; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1600; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 1601; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1602; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1603; AVX2-SLOW-NEXT: vzeroupper 1604; AVX2-SLOW-NEXT: retq 1605; 1606; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32: 1607; AVX2-FAST-ALL: # %bb.0: 1608; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 1609; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 1610; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 1611; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1612; AVX2-FAST-ALL-NEXT: vzeroupper 1613; AVX2-FAST-ALL-NEXT: retq 1614; 1615; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32: 1616; AVX2-FAST-PERLANE: # %bb.0: 1617; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 1618; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1619; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2 1620; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1621; AVX2-FAST-PERLANE-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1622; AVX2-FAST-PERLANE-NEXT: vzeroupper 1623; AVX2-FAST-PERLANE-NEXT: retq 1624; 1625; AVX512F-LABEL: trunc_mul_v4i64_v4i32: 1626; AVX512F: # %bb.0: 1627; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1628; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1629; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 1630; AVX512F-NEXT: vpmovqd %zmm0, 
%ymm0 1631; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1632; AVX512F-NEXT: vzeroupper 1633; AVX512F-NEXT: retq 1634; 1635; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: 1636; AVX512BW: # %bb.0: 1637; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1638; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1639; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 1640; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1641; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1642; AVX512BW-NEXT: vzeroupper 1643; AVX512BW-NEXT: retq 1644; 1645; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: 1646; AVX512DQ: # %bb.0: 1647; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1648; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1649; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1650; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 1651; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1652; AVX512DQ-NEXT: vzeroupper 1653; AVX512DQ-NEXT: retq 1654 %1 = mul <4 x i64> %a0, %a1 1655 %2 = trunc <4 x i64> %1 to <4 x i32> 1656 ret <4 x i32> %2 1657} 1658 1659define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 1660; SSE-LABEL: trunc_mul_v8i64_v8i16: 1661; SSE: # %bb.0: 1662; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 1663; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] 1664; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1665; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 1666; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1667; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] 1668; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] 1669; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 1670; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] 1671; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1672; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] 1673; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1674; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1675; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1676; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 1677; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1678; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 1679; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 1680; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1681; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1682; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1683; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 1684; SSE-NEXT: pmullw %xmm6, %xmm0 1685; SSE-NEXT: retq 1686; 1687; AVX1-LABEL: trunc_mul_v8i64_v8i16: 1688; AVX1: # %bb.0: 1689; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] 1690; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 1691; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 1692; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 1693; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 1694; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 1695; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 1696; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1697; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 1698; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1699; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1700; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 1701; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1702; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 1703; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1704; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1705; AVX1-NEXT: vzeroupper 1706; AVX1-NEXT: retq 1707; 1708; AVX2-LABEL: trunc_mul_v8i64_v8i16: 1709; AVX2: # %bb.0: 1710; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 1711; AVX2-NEXT: 
vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15] 1712; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15] 1713; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1714; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1715; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1716; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15] 1717; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15] 1718; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1719; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1720; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1721; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1722; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1723; AVX2-NEXT: vzeroupper 1724; AVX2-NEXT: retq 1725; 1726; AVX512F-LABEL: trunc_mul_v8i64_v8i16: 1727; AVX512F: # %bb.0: 1728; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 1729; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 1730; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1731; AVX512F-NEXT: vzeroupper 1732; AVX512F-NEXT: retq 1733; 1734; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: 1735; AVX512BW: # %bb.0: 1736; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 1737; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 1738; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1739; AVX512BW-NEXT: vzeroupper 1740; AVX512BW-NEXT: retq 1741; 1742; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: 1743; AVX512DQ: # %bb.0: 1744; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1745; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 1746; AVX512DQ-NEXT: vzeroupper 1747; AVX512DQ-NEXT: retq 1748 %1 = mul <8 x i64> %a0, %a1 1749 %2 = trunc <8 x i64> %1 to <8 x i16> 1750 ret <8 x i16> %2 1751} 1752 1753define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 1754; SSE-LABEL: trunc_mul_v8i32_v8i16: 1755; SSE: # %bb.0: 1756; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 1757; SSE-NEXT: pmuludq %xmm2, %xmm0 1758; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1759; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1760; SSE-NEXT: pmuludq %xmm4, %xmm2 1761; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1762; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1763; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1764; SSE-NEXT: pmuludq %xmm3, %xmm1 1765; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1766; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1767; SSE-NEXT: pmuludq %xmm2, %xmm3 1768; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 1769; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1770; SSE-NEXT: pslld $16, %xmm1 1771; SSE-NEXT: psrad $16, %xmm1 1772; SSE-NEXT: pslld $16, %xmm0 1773; SSE-NEXT: psrad $16, %xmm0 1774; SSE-NEXT: packssdw %xmm1, %xmm0 1775; SSE-NEXT: retq 1776; 1777; AVX1-LABEL: trunc_mul_v8i32_v8i16: 1778; AVX1: # %bb.0: 1779; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 1780; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1781; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1782; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1783; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 1784; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] 1785; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4],xmm1[5],xmm2[6],xmm1[7] 1786; AVX1-NEXT: vpackusdw %xmm0, %xmm1, %xmm0 1787; AVX1-NEXT: vzeroupper 1788; AVX1-NEXT: retq 1789; 1790; AVX2-LABEL: trunc_mul_v8i32_v8i16: 1791; AVX2: # %bb.0: 1792; AVX2-NEXT: vpmulld %ymm1, %ymm0, 
%ymm0 1793; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 1794; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1795; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1796; AVX2-NEXT: vzeroupper 1797; AVX2-NEXT: retq 1798; 1799; AVX512-LABEL: trunc_mul_v8i32_v8i16: 1800; AVX512: # %bb.0: 1801; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1802; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1803; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1804; AVX512-NEXT: vzeroupper 1805; AVX512-NEXT: retq 1806 %1 = mul <8 x i32> %a0, %a1 1807 %2 = trunc <8 x i32> %1 to <8 x i16> 1808 ret <8 x i16> %2 1809} 1810 1811define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 1812; SSE-LABEL: trunc_mul_v16i64_v16i8: 1813; SSE: # %bb.0: 1814; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0 1815; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1 1816; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2 1817; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3 1818; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4 1819; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5 1820; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6 1821; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7 1822; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1823; SSE-NEXT: pand %xmm8, %xmm7 1824; SSE-NEXT: pand %xmm8, %xmm6 1825; SSE-NEXT: packuswb %xmm7, %xmm6 1826; SSE-NEXT: pand %xmm8, %xmm5 1827; SSE-NEXT: pand %xmm8, %xmm4 1828; SSE-NEXT: packuswb %xmm5, %xmm4 1829; SSE-NEXT: packuswb %xmm6, %xmm4 1830; SSE-NEXT: pand %xmm8, %xmm3 1831; SSE-NEXT: pand %xmm8, %xmm2 1832; SSE-NEXT: packuswb %xmm3, %xmm2 1833; SSE-NEXT: pand %xmm8, %xmm1 1834; SSE-NEXT: pand %xmm8, %xmm0 1835; SSE-NEXT: packuswb %xmm1, %xmm0 1836; SSE-NEXT: packuswb %xmm2, %xmm0 1837; SSE-NEXT: packuswb %xmm4, %xmm0 1838; SSE-NEXT: retq 1839; 1840; AVX1-LABEL: trunc_mul_v16i64_v16i8: 1841; AVX1: # %bb.0: 1842; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8 1843; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 1844; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1845; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 1846; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 1847; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 1848; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1849; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 1850; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5 1851; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 1852; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1853; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 1854; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6 1855; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 1856; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1857; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 1858; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] 1859; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 1860; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 1861; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 1862; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 1863; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 1864; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 1865; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1866; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 1867; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 1868; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 1869; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 1870; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 1871; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 1872; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1873; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1874; AVX1-NEXT: vzeroupper 1875; AVX1-NEXT: retq 1876; 1877; AVX2-LABEL: trunc_mul_v16i64_v16i8: 1878; AVX2: # %bb.0: 1879; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 1880; 
AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1 1881; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2 1882; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3 1883; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 1884; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 1885; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 1886; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1887; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 1888; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 1889; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 1890; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1891; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1892; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 1893; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1894; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1895; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1896; AVX2-NEXT: vzeroupper 1897; AVX2-NEXT: retq 1898; 1899; AVX512F-LABEL: trunc_mul_v16i64_v16i8: 1900; AVX512F: # %bb.0: 1901; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 1902; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 1903; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 1904; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 1905; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1906; AVX512F-NEXT: vzeroupper 1907; AVX512F-NEXT: retq 1908; 1909; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: 1910; AVX512BW: # %bb.0: 1911; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 1912; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 1913; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 1914; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 1915; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1916; AVX512BW-NEXT: vzeroupper 1917; AVX512BW-NEXT: retq 1918; 1919; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: 1920; AVX512DQ: # %bb.0: 1921; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 1922; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 1923; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 1924; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 1925; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1926; AVX512DQ-NEXT: vzeroupper 1927; AVX512DQ-NEXT: retq 1928 %1 = mul <16 x i64> %a0, %a1 1929 %2 = trunc <16 x i64> %1 to <16 x i8> 1930 ret <16 x i8> %2 1931} 1932 1933define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 1934; SSE-LABEL: trunc_mul_v16i32_v16i8: 1935; SSE: # %bb.0: 1936; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] 1937; SSE-NEXT: pmuludq %xmm4, %xmm0 1938; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1939; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1940; SSE-NEXT: pmuludq %xmm8, %xmm4 1941; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1942; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 1943; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 1944; SSE-NEXT: pmuludq %xmm5, %xmm1 1945; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1946; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1947; SSE-NEXT: pmuludq %xmm4, %xmm5 1948; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1949; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 1950; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 1951; SSE-NEXT: pmuludq %xmm6, %xmm2 1952; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1953; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] 1954; SSE-NEXT: pmuludq %xmm4, %xmm5 1955; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1956; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 1957; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 1958; SSE-NEXT: pmuludq %xmm7, %xmm3 1959; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1960; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 1961; SSE-NEXT: pmuludq %xmm4, %xmm5 1962; SSE-NEXT: pshufd 
{{.*#+}} xmm4 = xmm5[0,2,2,3] 1963; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 1964; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1965; SSE-NEXT: pand %xmm4, %xmm3 1966; SSE-NEXT: pand %xmm4, %xmm2 1967; SSE-NEXT: packuswb %xmm3, %xmm2 1968; SSE-NEXT: pand %xmm4, %xmm1 1969; SSE-NEXT: pand %xmm4, %xmm0 1970; SSE-NEXT: packuswb %xmm1, %xmm0 1971; SSE-NEXT: packuswb %xmm2, %xmm0 1972; SSE-NEXT: retq 1973; 1974; AVX1-LABEL: trunc_mul_v16i32_v16i8: 1975; AVX1: # %bb.0: 1976; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 1977; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1978; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1979; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 1980; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 1981; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1982; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1983; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 1984; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] 1985; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1986; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1987; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 1988; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1989; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 1990; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 1991; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1992; AVX1-NEXT: vzeroupper 1993; AVX1-NEXT: retq 1994; 1995; AVX2-LABEL: trunc_mul_v16i32_v16i8: 1996; AVX2: # %bb.0: 1997; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 1998; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 1999; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 2000; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2001; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2002; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2003; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2004; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2005; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2006; AVX2-NEXT: vzeroupper 2007; AVX2-NEXT: retq 2008; 2009; AVX512-LABEL: trunc_mul_v16i32_v16i8: 2010; AVX512: # %bb.0: 2011; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 2012; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2013; AVX512-NEXT: vzeroupper 2014; AVX512-NEXT: retq 2015 %1 = mul <16 x i32> %a0, %a1 2016 %2 = trunc <16 x i32> %1 to <16 x i8> 2017 ret <16 x i8> %2 2018} 2019 2020define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 2021; SSE-LABEL: trunc_mul_v16i16_v16i8: 2022; SSE: # %bb.0: 2023; SSE-NEXT: pmullw %xmm2, %xmm0 2024; SSE-NEXT: pmullw %xmm3, %xmm1 2025; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2026; SSE-NEXT: pand %xmm2, %xmm1 2027; SSE-NEXT: pand %xmm2, %xmm0 2028; SSE-NEXT: packuswb %xmm1, %xmm0 2029; SSE-NEXT: retq 2030; 2031; AVX1-LABEL: trunc_mul_v16i16_v16i8: 2032; AVX1: # %bb.0: 2033; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2034; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2035; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2036; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2037; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 2038; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 2039; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 2040; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2041; AVX1-NEXT: vzeroupper 2042; AVX1-NEXT: retq 2043; 2044; AVX2-LABEL: trunc_mul_v16i16_v16i8: 2045; AVX2: # %bb.0: 2046; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2047; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2048; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2049; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2050; AVX2-NEXT: vzeroupper 2051; AVX2-NEXT: retq 2052; 2053; AVX512F-LABEL: trunc_mul_v16i16_v16i8: 2054; AVX512F: # 
%bb.0: 2055; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2056; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2057; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2058; AVX512F-NEXT: vzeroupper 2059; AVX512F-NEXT: retq 2060; 2061; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: 2062; AVX512BW: # %bb.0: 2063; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2064; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2065; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2066; AVX512BW-NEXT: vzeroupper 2067; AVX512BW-NEXT: retq 2068; 2069; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: 2070; AVX512DQ: # %bb.0: 2071; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2072; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2073; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2074; AVX512DQ-NEXT: vzeroupper 2075; AVX512DQ-NEXT: retq 2076 %1 = mul <16 x i16> %a0, %a1 2077 %2 = trunc <16 x i16> %1 to <16 x i8> 2078 ret <16 x i8> %2 2079} 2080 2081define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { 2082; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2083; SSE: # %bb.0: 2084; SSE-NEXT: pxor %xmm3, %xmm3 2085; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 2086; SSE-NEXT: pslld $16, %xmm2 2087; SSE-NEXT: psrad $16, %xmm2 2088; SSE-NEXT: pslld $16, %xmm1 2089; SSE-NEXT: psrad $16, %xmm1 2090; SSE-NEXT: packssdw %xmm2, %xmm1 2091; SSE-NEXT: pmullw %xmm1, %xmm0 2092; SSE-NEXT: retq 2093; 2094; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2095; AVX1: # %bb.0: 2096; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2097; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2098; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 2099; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2100; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2101; AVX1-NEXT: vzeroupper 2102; AVX1-NEXT: retq 2103; 2104; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2105; AVX2: # %bb.0: 2106; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2107; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2108; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2109; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2110; AVX2-NEXT: vzeroupper 2111; AVX2-NEXT: retq 2112; 2113; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2114; AVX512: # %bb.0: 2115; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2116; AVX512-NEXT: vpmovdw %zmm1, %ymm1 2117; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2118; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2119; AVX512-NEXT: vzeroupper 2120; AVX512-NEXT: retq 2121 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2122 %2 = zext <8 x i8> %1 to <8 x i32> 2123 %3 = mul <8 x i32> %2, %a1 2124 %4 = trunc <8 x i32> %3 to <8 x i16> 2125 ret <8 x i16> %4 
2126} 2127 2128; 2129; mul to constant 2130; 2131 2132define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 2133; SSE-LABEL: trunc_mul_const_v4i64_v4i32: 2134; SSE: # %bb.0: 2135; SSE-NEXT: xorps %xmm2, %xmm2 2136; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 2137; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2138; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] 2139; SSE-NEXT: movaps %xmm2, %xmm0 2140; SSE-NEXT: retq 2141; 2142; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: 2143; AVX1: # %bb.0: 2144; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2145; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2146; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2147; AVX1-NEXT: vzeroupper 2148; AVX1-NEXT: retq 2149; 2150; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: 2151; AVX2-SLOW: # %bb.0: 2152; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2153; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2154; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2155; AVX2-SLOW-NEXT: vzeroupper 2156; AVX2-SLOW-NEXT: retq 2157; 2158; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32: 2159; AVX2-FAST-ALL: # %bb.0: 2160; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 2161; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 2162; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2163; AVX2-FAST-ALL-NEXT: vzeroupper 2164; AVX2-FAST-ALL-NEXT: retq 2165; 2166; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32: 2167; AVX2-FAST-PERLANE: # %bb.0: 2168; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2169; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2170; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2171; AVX2-FAST-PERLANE-NEXT: vzeroupper 2172; AVX2-FAST-PERLANE-NEXT: retq 2173; 2174; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: 2175; AVX512: # %bb.0: 2176; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2177; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2178; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2179; AVX512-NEXT: vzeroupper 2180; AVX512-NEXT: retq 2181 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2182 %2 = trunc <4 x i64> %1 to <4 x i32> 2183 ret <4 x i32> %2 2184} 2185 2186define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 2187; SSE-LABEL: trunc_mul_const_v8i64_v8i16: 2188; SSE: # %bb.0: 2189; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2190; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2191; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2192; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2193; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2194; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2195; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2196; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2197; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2198; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2199; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2200; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2201; SSE-NEXT: retq 2202; 2203; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: 2204; AVX1: # %bb.0: 2205; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 2206; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2207; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2208; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2209; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2210; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2211; 
AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2212; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2213; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2214; AVX1-NEXT: vzeroupper 2215; AVX1-NEXT: retq 2216; 2217; AVX2-LABEL: trunc_mul_const_v8i64_v8i16: 2218; AVX2: # %bb.0: 2219; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2220; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2221; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2222; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2223; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2224; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2225; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2226; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2227; AVX2-NEXT: vzeroupper 2228; AVX2-NEXT: retq 2229; 2230; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: 2231; AVX512: # %bb.0: 2232; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2233; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2234; AVX512-NEXT: vzeroupper 2235; AVX512-NEXT: retq 2236 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 2237 %2 = trunc <8 x i64> %1 to <8 x i16> 2238 ret <8 x i16> %2 2239} 2240 2241define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 2242; SSE-LABEL: trunc_mul_const_v8i32_v8i16: 2243; SSE: # %bb.0: 2244; SSE-NEXT: pslld $16, %xmm1 2245; SSE-NEXT: psrad $16, %xmm1 2246; SSE-NEXT: pslld $16, %xmm0 2247; SSE-NEXT: psrad $16, %xmm0 2248; SSE-NEXT: packssdw %xmm1, %xmm0 2249; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2250; SSE-NEXT: retq 2251; 2252; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: 2253; AVX1: # %bb.0: 2254; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2255; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2256; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2257; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2258; AVX1-NEXT: vzeroupper 2259; AVX1-NEXT: retq 2260; 2261; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: 2262; AVX2: # %bb.0: 2263; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2264; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2265; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2266; AVX2-NEXT: vzeroupper 2267; AVX2-NEXT: retq 2268; 2269; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: 2270; AVX512: # %bb.0: 2271; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2272; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2273; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2274; AVX512-NEXT: vzeroupper 2275; AVX512-NEXT: retq 2276 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2277 %2 = trunc <8 x i32> %1 to <8 x i16> 2278 ret <8 x i16> %2 2279} 2280 2281define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 2282; SSE-LABEL: trunc_mul_const_v16i64_v16i8: 2283; SSE: # %bb.0: 2284; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2285; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2286; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2287; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2288; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 2289; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 2290; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 2291; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2292; 
SSE-NEXT: pand %xmm8, %xmm7 2293; SSE-NEXT: pand %xmm8, %xmm6 2294; SSE-NEXT: packuswb %xmm7, %xmm6 2295; SSE-NEXT: pand %xmm8, %xmm5 2296; SSE-NEXT: pand %xmm8, %xmm4 2297; SSE-NEXT: packuswb %xmm5, %xmm4 2298; SSE-NEXT: packuswb %xmm6, %xmm4 2299; SSE-NEXT: pand %xmm8, %xmm3 2300; SSE-NEXT: pand %xmm8, %xmm2 2301; SSE-NEXT: packuswb %xmm3, %xmm2 2302; SSE-NEXT: pand %xmm8, %xmm1 2303; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2304; SSE-NEXT: packuswb %xmm1, %xmm0 2305; SSE-NEXT: packuswb %xmm2, %xmm0 2306; SSE-NEXT: packuswb %xmm4, %xmm0 2307; SSE-NEXT: retq 2308; 2309; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: 2310; AVX1: # %bb.0: 2311; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8 2312; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2313; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2314; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5 2315; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2316; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2317; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6 2318; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2319; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 2320; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 2321; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2322; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 2323; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] 2324; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2325; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 2326; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 2327; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2328; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 2329; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 2330; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2331; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2332; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 2333; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2334; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2335; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 2336; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 2337; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2338; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2339; AVX1-NEXT: vzeroupper 2340; AVX1-NEXT: retq 2341; 2342; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: 2343; AVX2: # %bb.0: 2344; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2345; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2346; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2347; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 2348; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 2349; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 2350; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 2351; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 2352; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 2353; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 2354; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 2355; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2356; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 2357; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 2358; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2359; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2360; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2361; AVX2-NEXT: vzeroupper 2362; AVX2-NEXT: retq 2363; 2364; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: 2365; AVX512F: # %bb.0: 2366; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2367; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2368; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 2369; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 2370; 
AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2371; AVX512F-NEXT: vzeroupper 2372; AVX512F-NEXT: retq 2373; 2374; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: 2375; AVX512BW: # %bb.0: 2376; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2377; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2378; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 2379; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 2380; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2381; AVX512BW-NEXT: vzeroupper 2382; AVX512BW-NEXT: retq 2383; 2384; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: 2385; AVX512DQ: # %bb.0: 2386; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2387; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2388; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 2389; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 2390; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2391; AVX512DQ-NEXT: vzeroupper 2392; AVX512DQ-NEXT: retq 2393 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 2394 %2 = trunc <16 x i64> %1 to <16 x i8> 2395 ret <16 x i8> %2 2396} 2397 2398define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 2399; SSE-LABEL: trunc_mul_const_v16i32_v16i8: 2400; SSE: # %bb.0: 2401; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 2402; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2403; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2404; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2405; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2406; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2407; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 2408; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2409; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2410; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2411; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2412; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2413; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 2414; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2415; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2416; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2417; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2418; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2419; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 2420; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2421; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2422; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2423; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2424; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2425; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2426; SSE-NEXT: pand %xmm4, %xmm3 2427; SSE-NEXT: pand %xmm4, %xmm2 2428; SSE-NEXT: packuswb %xmm3, %xmm2 2429; SSE-NEXT: pand %xmm4, %xmm1 2430; SSE-NEXT: pand %xmm4, %xmm0 2431; SSE-NEXT: packuswb %xmm1, %xmm0 2432; SSE-NEXT: packuswb %xmm2, %xmm0 2433; SSE-NEXT: retq 2434; 2435; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: 2436; AVX1: # %bb.0: 2437; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 2438; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2439; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2440; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 2441; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2442; AVX1-NEXT: vpmulld 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2443; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255] 2444; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2445; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2446; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2447; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2448; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2449; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 2450; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2451; AVX1-NEXT: vzeroupper 2452; AVX1-NEXT: retq 2453; 2454; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: 2455; AVX2: # %bb.0: 2456; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2457; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2458; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 2459; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2460; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2461; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2462; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2463; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2464; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2465; AVX2-NEXT: vzeroupper 2466; AVX2-NEXT: retq 2467; 2468; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: 2469; AVX512: # %bb.0: 2470; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2471; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2472; AVX512-NEXT: vzeroupper 2473; AVX512-NEXT: retq 2474 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2475 %2 = trunc <16 x i32> %1 to <16 x i8> 2476 ret <16 x i8> %2 2477} 2478 2479define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 2480; SSE-LABEL: trunc_mul_const_v16i16_v16i8: 2481; SSE: # %bb.0: 2482; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2483; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2484; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2485; SSE-NEXT: pand %xmm2, %xmm1 2486; SSE-NEXT: pand %xmm2, %xmm0 2487; SSE-NEXT: packuswb %xmm1, %xmm0 2488; SSE-NEXT: retq 2489; 2490; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: 2491; AVX1: # %bb.0: 2492; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2493; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2494; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2495; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2496; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2497; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 2498; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2499; AVX1-NEXT: vzeroupper 2500; AVX1-NEXT: retq 2501; 2502; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: 2503; AVX2: # %bb.0: 2504; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2505; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2506; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2507; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2508; AVX2-NEXT: vzeroupper 2509; AVX2-NEXT: retq 2510; 2511; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: 2512; AVX512F: # %bb.0: 2513; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2514; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2515; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2516; AVX512F-NEXT: vzeroupper 2517; AVX512F-NEXT: retq 2518; 2519; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: 2520; AVX512BW: # %bb.0: 2521; AVX512BW-NEXT: vpmullw 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2522; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2523; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2524; AVX512BW-NEXT: vzeroupper 2525; AVX512BW-NEXT: retq 2526; 2527; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: 2528; AVX512DQ: # %bb.0: 2529; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2530; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2531; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2532; AVX512DQ-NEXT: vzeroupper 2533; AVX512DQ-NEXT: retq 2534 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 2535 %2 = trunc <16 x i16> %1 to <16 x i8> 2536 ret <16 x i8> %2 2537} 2538 2539; 2540; and 2541; 2542 2543define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2544; SSE-LABEL: trunc_and_v4i64_v4i32: 2545; SSE: # %bb.0: 2546; SSE-NEXT: andps %xmm3, %xmm1 2547; SSE-NEXT: andps %xmm2, %xmm0 2548; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2549; SSE-NEXT: retq 2550; 2551; AVX1-LABEL: trunc_and_v4i64_v4i32: 2552; AVX1: # %bb.0: 2553; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2554; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2555; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2556; AVX1-NEXT: vzeroupper 2557; AVX1-NEXT: retq 2558; 2559; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: 2560; AVX2-SLOW: # %bb.0: 2561; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 2562; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2563; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2564; AVX2-SLOW-NEXT: vzeroupper 2565; AVX2-SLOW-NEXT: retq 2566; 2567; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32: 2568; AVX2-FAST-ALL: # %bb.0: 2569; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 2570; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 2571; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 2572; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2573; AVX2-FAST-ALL-NEXT: vzeroupper 2574; AVX2-FAST-ALL-NEXT: retq 2575; 2576; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32: 2577; AVX2-FAST-PERLANE: # %bb.0: 2578; AVX2-FAST-PERLANE-NEXT: vandps %ymm1, %ymm0, %ymm0 2579; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2580; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2581; AVX2-FAST-PERLANE-NEXT: vzeroupper 2582; AVX2-FAST-PERLANE-NEXT: retq 2583; 2584; AVX512-LABEL: trunc_and_v4i64_v4i32: 2585; AVX512: # %bb.0: 2586; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2587; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2588; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2589; AVX512-NEXT: vzeroupper 2590; AVX512-NEXT: retq 2591 %1 = and <4 x i64> %a0, %a1 2592 %2 = trunc <4 x i64> %1 to <4 x i32> 2593 ret <4 x i32> %2 2594} 2595 2596define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 2597; SSE-LABEL: trunc_and_v8i64_v8i16: 2598; SSE: # %bb.0: 2599; SSE-NEXT: pand %xmm6, %xmm2 2600; SSE-NEXT: pand %xmm7, %xmm3 2601; SSE-NEXT: pand %xmm4, %xmm0 2602; SSE-NEXT: pand %xmm5, %xmm1 2603; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2604; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2605; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2606; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2607; SSE-NEXT: punpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2608; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2609; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2610; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2611; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2612; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2613; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2614; SSE-NEXT: retq 2615; 2616; AVX1-LABEL: trunc_and_v8i64_v8i16: 2617; AVX1: # %bb.0: 2618; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] 2619; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 2620; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2621; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2622; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2623; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 2624; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2625; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2626; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2627; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2628; AVX1-NEXT: vzeroupper 2629; AVX1-NEXT: retq 2630; 2631; AVX2-LABEL: trunc_and_v8i64_v8i16: 2632; AVX2: # %bb.0: 2633; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2634; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2635; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2636; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2637; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2638; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2639; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2640; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2641; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2642; AVX2-NEXT: vzeroupper 2643; AVX2-NEXT: retq 2644; 2645; AVX512-LABEL: trunc_and_v8i64_v8i16: 2646; AVX512: # %bb.0: 2647; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 2648; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2649; AVX512-NEXT: vzeroupper 2650; AVX512-NEXT: retq 2651 %1 = and <8 x i64> %a0, %a1 2652 %2 = trunc <8 x i64> %1 to <8 x i16> 2653 ret <8 x i16> %2 2654} 2655 2656define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 2657; SSE-LABEL: trunc_and_v8i32_v8i16: 2658; SSE: # %bb.0: 2659; SSE-NEXT: pand %xmm2, %xmm0 2660; SSE-NEXT: pand %xmm3, %xmm1 2661; SSE-NEXT: pslld $16, %xmm1 2662; SSE-NEXT: psrad $16, %xmm1 2663; SSE-NEXT: pslld $16, %xmm0 2664; SSE-NEXT: psrad $16, %xmm0 2665; SSE-NEXT: packssdw %xmm1, %xmm0 2666; SSE-NEXT: retq 2667; 2668; AVX1-LABEL: trunc_and_v8i32_v8i16: 2669; AVX1: # %bb.0: 2670; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2671; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2672; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2673; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2674; AVX1-NEXT: vzeroupper 2675; AVX1-NEXT: retq 2676; 2677; AVX2-LABEL: trunc_and_v8i32_v8i16: 2678; AVX2: # %bb.0: 2679; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2680; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2681; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2682; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2683; AVX2-NEXT: vzeroupper 2684; AVX2-NEXT: retq 2685; 2686; AVX512-LABEL: trunc_and_v8i32_v8i16: 2687; AVX512: # %bb.0: 2688; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2689; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2690; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2691; AVX512-NEXT: vzeroupper 2692; AVX512-NEXT: retq 2693 %1 = and <8 x i32> %a0, %a1 2694 %2 = trunc <8 x i32> %1 to <8 x i16> 2695 ret <8 x i16> %2 2696} 2697 2698define <16 x i8> 
@trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 2699; SSE-LABEL: trunc_and_v16i64_v16i8: 2700; SSE: # %bb.0: 2701; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0 2702; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1 2703; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2 2704; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3 2705; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4 2706; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5 2707; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6 2708; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7 2709; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2710; SSE-NEXT: pand %xmm8, %xmm7 2711; SSE-NEXT: pand %xmm8, %xmm6 2712; SSE-NEXT: packuswb %xmm7, %xmm6 2713; SSE-NEXT: pand %xmm8, %xmm5 2714; SSE-NEXT: pand %xmm8, %xmm4 2715; SSE-NEXT: packuswb %xmm5, %xmm4 2716; SSE-NEXT: packuswb %xmm6, %xmm4 2717; SSE-NEXT: pand %xmm8, %xmm3 2718; SSE-NEXT: pand %xmm8, %xmm2 2719; SSE-NEXT: packuswb %xmm3, %xmm2 2720; SSE-NEXT: pand %xmm8, %xmm1 2721; SSE-NEXT: pand %xmm8, %xmm0 2722; SSE-NEXT: packuswb %xmm1, %xmm0 2723; SSE-NEXT: packuswb %xmm2, %xmm0 2724; SSE-NEXT: packuswb %xmm4, %xmm0 2725; SSE-NEXT: retq 2726; 2727; AVX1-LABEL: trunc_and_v16i64_v16i8: 2728; AVX1: # %bb.0: 2729; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255] 2730; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7 2731; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 2732; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 2733; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3 2734; AVX1-NEXT: vandps %ymm6, %ymm8, %ymm6 2735; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 2736; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 2737; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 2738; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2739; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm3 2740; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2741; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2742; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2743; AVX1-NEXT: vandps %ymm4, %ymm8, %ymm3 2744; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 2745; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2746; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 2747; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2748; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2749; AVX1-NEXT: vzeroupper 2750; AVX1-NEXT: retq 2751; 2752; AVX2-LABEL: trunc_and_v16i64_v16i8: 2753; AVX2: # %bb.0: 2754; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255] 2755; AVX2-NEXT: vpand %ymm7, %ymm8, %ymm7 2756; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3 2757; AVX2-NEXT: vpand %ymm6, %ymm8, %ymm6 2758; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2 2759; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 2760; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 2761; AVX2-NEXT: vpand %ymm5, %ymm8, %ymm3 2762; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2763; AVX2-NEXT: vpand %ymm4, %ymm8, %ymm3 2764; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2765; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2766; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 2767; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 2768; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2769; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2770; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2771; AVX2-NEXT: vzeroupper 2772; AVX2-NEXT: retq 2773; 2774; AVX512-LABEL: trunc_and_v16i64_v16i8: 2775; AVX512: # %bb.0: 2776; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 2777; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 2778; AVX512-NEXT: vpmovqb %zmm1, %xmm1 2779; AVX512-NEXT: vpmovqb %zmm0, %xmm0 2780; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2781; AVX512-NEXT: vzeroupper 2782; AVX512-NEXT: retq 2783 %1 = and <16 x i64> %a0, %a1 2784 %2 = trunc <16 x i64> %1 to <16 x i8> 2785 ret <16 x i8> 
%2 2786} 2787 2788define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 2789; SSE-LABEL: trunc_and_v16i32_v16i8: 2790; SSE: # %bb.0: 2791; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2792; SSE-NEXT: pand %xmm8, %xmm7 2793; SSE-NEXT: pand %xmm3, %xmm7 2794; SSE-NEXT: pand %xmm8, %xmm6 2795; SSE-NEXT: pand %xmm2, %xmm6 2796; SSE-NEXT: packuswb %xmm7, %xmm6 2797; SSE-NEXT: pand %xmm8, %xmm5 2798; SSE-NEXT: pand %xmm1, %xmm5 2799; SSE-NEXT: pand %xmm4, %xmm8 2800; SSE-NEXT: pand %xmm8, %xmm0 2801; SSE-NEXT: packuswb %xmm5, %xmm0 2802; SSE-NEXT: packuswb %xmm6, %xmm0 2803; SSE-NEXT: retq 2804; 2805; AVX1-LABEL: trunc_and_v16i32_v16i8: 2806; AVX1: # %bb.0: 2807; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] 2808; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 2809; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2810; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2811; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2812; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 2813; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2814; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2815; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2816; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2817; AVX1-NEXT: vzeroupper 2818; AVX1-NEXT: retq 2819; 2820; AVX2-LABEL: trunc_and_v16i32_v16i8: 2821; AVX2: # %bb.0: 2822; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] 2823; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 2824; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2825; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 2826; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2827; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2828; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2829; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2830; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2831; AVX2-NEXT: vzeroupper 2832; AVX2-NEXT: retq 2833; 2834; AVX512-LABEL: trunc_and_v16i32_v16i8: 2835; AVX512: # %bb.0: 2836; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 2837; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2838; AVX512-NEXT: vzeroupper 2839; AVX512-NEXT: retq 2840 %1 = and <16 x i32> %a0, %a1 2841 %2 = trunc <16 x i32> %1 to <16 x i8> 2842 ret <16 x i8> %2 2843} 2844 2845define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 2846; SSE-LABEL: trunc_and_v16i16_v16i8: 2847; SSE: # %bb.0: 2848; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2849; SSE-NEXT: pand %xmm4, %xmm3 2850; SSE-NEXT: pand %xmm1, %xmm3 2851; SSE-NEXT: pand %xmm2, %xmm4 2852; SSE-NEXT: pand %xmm4, %xmm0 2853; SSE-NEXT: packuswb %xmm3, %xmm0 2854; SSE-NEXT: retq 2855; 2856; AVX1-LABEL: trunc_and_v16i16_v16i8: 2857; AVX1: # %bb.0: 2858; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2859; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2860; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2861; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2862; AVX1-NEXT: vzeroupper 2863; AVX1-NEXT: retq 2864; 2865; AVX2-LABEL: trunc_and_v16i16_v16i8: 2866; AVX2: # %bb.0: 2867; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2868; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2869; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2870; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2871; AVX2-NEXT: vzeroupper 2872; AVX2-NEXT: retq 2873; 2874; AVX512F-LABEL: trunc_and_v16i16_v16i8: 2875; AVX512F: # %bb.0: 2876; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 2877; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2878; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2879; AVX512F-NEXT: vzeroupper 2880; AVX512F-NEXT: retq 2881; 2882; AVX512BW-LABEL: trunc_and_v16i16_v16i8: 2883; AVX512BW: # %bb.0: 2884; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 2885; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2886; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2887; AVX512BW-NEXT: vzeroupper 2888; AVX512BW-NEXT: retq 2889; 2890; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: 2891; AVX512DQ: # %bb.0: 2892; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 2893; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2894; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2895; AVX512DQ-NEXT: vzeroupper 2896; AVX512DQ-NEXT: retq 2897 %1 = and <16 x i16> %a0, %a1 2898 %2 = trunc <16 x i16> %1 to <16 x i8> 2899 ret <16 x i8> %2 2900} 2901 2902; 2903; and to constant 2904; 2905 2906define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 2907; SSE-LABEL: trunc_and_const_v4i64_v4i32: 2908; SSE: # %bb.0: 2909; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2910; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2911; SSE-NEXT: retq 2912; 2913; AVX1-LABEL: trunc_and_const_v4i64_v4i32: 2914; AVX1: # %bb.0: 2915; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2916; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2917; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2918; AVX1-NEXT: vzeroupper 2919; AVX1-NEXT: retq 2920; 2921; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: 2922; AVX2-SLOW: # %bb.0: 2923; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2924; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2925; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2926; AVX2-SLOW-NEXT: vzeroupper 2927; AVX2-SLOW-NEXT: retq 2928; 2929; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32: 2930; AVX2-FAST-ALL: # %bb.0: 2931; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <u,2,4,6,u,u,u,u> 2932; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 2933; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2934; AVX2-FAST-ALL-NEXT: vzeroupper 2935; AVX2-FAST-ALL-NEXT: retq 2936; 2937; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32: 2938; AVX2-FAST-PERLANE: # %bb.0: 2939; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2940; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2941; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2942; AVX2-FAST-PERLANE-NEXT: vzeroupper 2943; AVX2-FAST-PERLANE-NEXT: retq 2944; 2945; AVX512-LABEL: trunc_and_const_v4i64_v4i32: 2946; AVX512: # %bb.0: 2947; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2948; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2949; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2950; AVX512-NEXT: vzeroupper 2951; AVX512-NEXT: retq 2952 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2953 %2 = trunc <4 x i64> %1 to <4 x i32> 2954 ret <4 x i32> %2 2955} 2956 2957define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 2958; SSE-LABEL: trunc_and_const_v8i64_v8i16: 2959; SSE: # %bb.0: 2960; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2961; SSE-NEXT: 
pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2962; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2963; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2964; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2965; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2966; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2967; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2968; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2969; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2970; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2971; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2972; SSE-NEXT: retq 2973; 2974; AVX1-LABEL: trunc_and_const_v8i64_v8i16: 2975; AVX1: # %bb.0: 2976; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 2977; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2978; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2979; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2980; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2981; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2982; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2983; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2984; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2985; AVX1-NEXT: vzeroupper 2986; AVX1-NEXT: retq 2987; 2988; AVX2-LABEL: trunc_and_const_v8i64_v8i16: 2989; AVX2: # %bb.0: 2990; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2991; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2992; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2993; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2994; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2995; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2996; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2997; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2998; AVX2-NEXT: vzeroupper 2999; AVX2-NEXT: retq 3000; 3001; AVX512-LABEL: trunc_and_const_v8i64_v8i16: 3002; AVX512: # %bb.0: 3003; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3004; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3005; AVX512-NEXT: vzeroupper 3006; AVX512-NEXT: retq 3007 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 3008 %2 = trunc <8 x i64> %1 to <8 x i16> 3009 ret <8 x i16> %2 3010} 3011 3012define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 3013; SSE-LABEL: trunc_and_const_v8i32_v8i16: 3014; SSE: # %bb.0: 3015; SSE-NEXT: pslld $16, %xmm1 3016; SSE-NEXT: psrad $16, %xmm1 3017; SSE-NEXT: pslld $16, %xmm0 3018; SSE-NEXT: psrad $16, %xmm0 3019; SSE-NEXT: packssdw %xmm1, %xmm0 3020; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3021; SSE-NEXT: retq 3022; 3023; AVX1-LABEL: trunc_and_const_v8i32_v8i16: 3024; AVX1: # %bb.0: 3025; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3026; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3027; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3028; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3029; AVX1-NEXT: vzeroupper 3030; AVX1-NEXT: retq 3031; 3032; AVX2-LABEL: trunc_and_const_v8i32_v8i16: 3033; AVX2: # %bb.0: 3034; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3035; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3036; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3037; AVX2-NEXT: vzeroupper 3038; AVX2-NEXT: retq 3039; 3040; AVX512-LABEL: trunc_and_const_v8i32_v8i16: 3041; AVX512: # %bb.0: 3042; 
AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3043; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3044; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3045; AVX512-NEXT: vzeroupper 3046; AVX512-NEXT: retq 3047 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3048 %2 = trunc <8 x i32> %1 to <8 x i16> 3049 ret <8 x i16> %2 3050} 3051 3052define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 3053; SSE-LABEL: trunc_and_const_v16i64_v16i8: 3054; SSE: # %bb.0: 3055; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3056; SSE-NEXT: pand %xmm8, %xmm7 3057; SSE-NEXT: pand %xmm8, %xmm6 3058; SSE-NEXT: packuswb %xmm7, %xmm6 3059; SSE-NEXT: pand %xmm8, %xmm5 3060; SSE-NEXT: pand %xmm8, %xmm4 3061; SSE-NEXT: packuswb %xmm5, %xmm4 3062; SSE-NEXT: packuswb %xmm6, %xmm4 3063; SSE-NEXT: pand %xmm8, %xmm3 3064; SSE-NEXT: pand %xmm8, %xmm2 3065; SSE-NEXT: packuswb %xmm3, %xmm2 3066; SSE-NEXT: pand %xmm8, %xmm1 3067; SSE-NEXT: pand %xmm8, %xmm0 3068; SSE-NEXT: packuswb %xmm1, %xmm0 3069; SSE-NEXT: packuswb %xmm2, %xmm0 3070; SSE-NEXT: packuswb %xmm4, %xmm0 3071; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3072; SSE-NEXT: retq 3073; 3074; AVX1-LABEL: trunc_and_const_v16i64_v16i8: 3075; AVX1: # %bb.0: 3076; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 3077; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3078; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3079; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3080; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3081; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3082; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3083; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3084; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3085; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3086; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3087; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3088; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3089; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3090; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3091; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3092; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3093; AVX1-NEXT: vzeroupper 3094; AVX1-NEXT: retq 3095; 3096; AVX2-LABEL: trunc_and_const_v16i64_v16i8: 3097; AVX2: # %bb.0: 3098; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 3099; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 3100; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 3101; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 3102; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 3103; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 3104; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3105; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3106; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3107; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 3108; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3109; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3110; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3111; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3112; AVX2-NEXT: vzeroupper 3113; AVX2-NEXT: retq 3114; 3115; AVX512-LABEL: trunc_and_const_v16i64_v16i8: 3116; AVX512: # %bb.0: 3117; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3118; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3119; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3120; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3121; AVX512-NEXT: vzeroupper 3122; AVX512-NEXT: retq 3123 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 3124 %2 = trunc <16 x i64> %1 to <16 x i8> 3125 ret <16 x i8> %2 
3126} 3127 3128define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 3129; SSE-LABEL: trunc_and_const_v16i32_v16i8: 3130; SSE: # %bb.0: 3131; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3132; SSE-NEXT: pand %xmm4, %xmm3 3133; SSE-NEXT: pand %xmm4, %xmm2 3134; SSE-NEXT: packuswb %xmm3, %xmm2 3135; SSE-NEXT: pand %xmm4, %xmm1 3136; SSE-NEXT: pand %xmm4, %xmm0 3137; SSE-NEXT: packuswb %xmm1, %xmm0 3138; SSE-NEXT: packuswb %xmm2, %xmm0 3139; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3140; SSE-NEXT: retq 3141; 3142; AVX1-LABEL: trunc_and_const_v16i32_v16i8: 3143; AVX1: # %bb.0: 3144; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3145; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3146; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3147; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3148; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3149; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3150; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3151; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3152; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3153; AVX1-NEXT: vzeroupper 3154; AVX1-NEXT: retq 3155; 3156; AVX2-LABEL: trunc_and_const_v16i32_v16i8: 3157; AVX2: # %bb.0: 3158; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3159; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 3160; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3161; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3162; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3163; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3164; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3165; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3166; AVX2-NEXT: vzeroupper 3167; AVX2-NEXT: retq 3168; 3169; AVX512-LABEL: trunc_and_const_v16i32_v16i8: 3170; AVX512: # %bb.0: 3171; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3172; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3173; AVX512-NEXT: vzeroupper 3174; AVX512-NEXT: retq 3175 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3176 %2 = trunc <16 x i32> %1 to <16 x i8> 3177 ret <16 x i8> %2 3178} 3179 3180define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3181; SSE-LABEL: trunc_and_const_v16i16_v16i8: 3182; SSE: # %bb.0: 3183; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3184; SSE-NEXT: pand %xmm2, %xmm1 3185; SSE-NEXT: pand %xmm2, %xmm0 3186; SSE-NEXT: packuswb %xmm1, %xmm0 3187; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3188; SSE-NEXT: retq 3189; 3190; AVX1-LABEL: trunc_and_const_v16i16_v16i8: 3191; AVX1: # %bb.0: 3192; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3193; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3194; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3195; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3196; AVX1-NEXT: vzeroupper 3197; AVX1-NEXT: retq 3198; 3199; AVX2-LABEL: trunc_and_const_v16i16_v16i8: 3200; AVX2: # %bb.0: 3201; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3202; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3203; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3204; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3205; AVX2-NEXT: vzeroupper 3206; AVX2-NEXT: retq 3207; 3208; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: 3209; AVX512F: # %bb.0: 3210; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3211; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3212; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3213; AVX512F-NEXT: vzeroupper 3214; AVX512F-NEXT: retq 3215; 3216; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: 3217; AVX512BW: # %bb.0: 3218; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3219; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3220; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3221; AVX512BW-NEXT: vzeroupper 3222; AVX512BW-NEXT: retq 3223; 3224; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: 3225; AVX512DQ: # %bb.0: 3226; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3227; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3228; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3229; AVX512DQ-NEXT: vzeroupper 3230; AVX512DQ-NEXT: retq 3231 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3232 %2 = trunc <16 x i16> %1 to <16 x i8> 3233 ret <16 x i8> %2 3234} 3235 3236; 3237; xor 3238; 3239 3240define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3241; SSE-LABEL: trunc_xor_v4i64_v4i32: 3242; SSE: # %bb.0: 3243; SSE-NEXT: xorps %xmm3, %xmm1 3244; SSE-NEXT: xorps %xmm2, %xmm0 3245; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3246; SSE-NEXT: retq 3247; 3248; AVX1-LABEL: trunc_xor_v4i64_v4i32: 3249; AVX1: # %bb.0: 3250; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3251; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3252; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3253; AVX1-NEXT: vzeroupper 3254; AVX1-NEXT: retq 3255; 3256; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: 3257; AVX2-SLOW: # %bb.0: 3258; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 3259; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3260; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3261; AVX2-SLOW-NEXT: vzeroupper 3262; AVX2-SLOW-NEXT: retq 3263; 3264; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32: 3265; AVX2-FAST-ALL: # %bb.0: 3266; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0 3267; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 3268; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 3269; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3270; AVX2-FAST-ALL-NEXT: vzeroupper 3271; AVX2-FAST-ALL-NEXT: retq 3272; 3273; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32: 3274; AVX2-FAST-PERLANE: # %bb.0: 3275; AVX2-FAST-PERLANE-NEXT: vxorps %ymm1, %ymm0, %ymm0 3276; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 3277; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3278; AVX2-FAST-PERLANE-NEXT: vzeroupper 3279; AVX2-FAST-PERLANE-NEXT: retq 3280; 3281; AVX512-LABEL: trunc_xor_v4i64_v4i32: 3282; AVX512: # %bb.0: 3283; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 3284; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3285; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3286; AVX512-NEXT: vzeroupper 3287; AVX512-NEXT: retq 3288 %1 = xor <4 x i64> %a0, %a1 3289 %2 = trunc <4 x i64> %1 to <4 x i32> 3290 ret <4 x i32> %2 3291} 3292 3293define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) 
nounwind { 3294; SSE-LABEL: trunc_xor_v8i64_v8i16: 3295; SSE: # %bb.0: 3296; SSE-NEXT: pxor %xmm6, %xmm2 3297; SSE-NEXT: pxor %xmm7, %xmm3 3298; SSE-NEXT: pxor %xmm4, %xmm0 3299; SSE-NEXT: pxor %xmm5, %xmm1 3300; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3301; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3302; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3303; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3304; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3305; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3306; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3307; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3308; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3309; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3310; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3311; SSE-NEXT: retq 3312; 3313; AVX1-LABEL: trunc_xor_v8i64_v8i16: 3314; AVX1: # %bb.0: 3315; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3316; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3317; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 3318; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3319; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3320; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3321; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3322; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3323; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3324; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3325; AVX1-NEXT: vzeroupper 3326; AVX1-NEXT: retq 3327; 3328; AVX2-LABEL: trunc_xor_v8i64_v8i16: 3329; AVX2: # %bb.0: 3330; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 3331; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 3332; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3333; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 3334; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 3335; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3336; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3337; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3338; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3339; AVX2-NEXT: vzeroupper 3340; AVX2-NEXT: retq 3341; 3342; AVX512-LABEL: trunc_xor_v8i64_v8i16: 3343; AVX512: # %bb.0: 3344; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 3345; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3346; AVX512-NEXT: vzeroupper 3347; AVX512-NEXT: retq 3348 %1 = xor <8 x i64> %a0, %a1 3349 %2 = trunc <8 x i64> %1 to <8 x i16> 3350 ret <8 x i16> %2 3351} 3352 3353define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 3354; SSE-LABEL: trunc_xor_v8i32_v8i16: 3355; SSE: # %bb.0: 3356; SSE-NEXT: pxor %xmm2, %xmm0 3357; SSE-NEXT: pxor %xmm3, %xmm1 3358; SSE-NEXT: pslld $16, %xmm1 3359; SSE-NEXT: psrad $16, %xmm1 3360; SSE-NEXT: pslld $16, %xmm0 3361; SSE-NEXT: psrad $16, %xmm0 3362; SSE-NEXT: packssdw %xmm1, %xmm0 3363; SSE-NEXT: retq 3364; 3365; AVX1-LABEL: trunc_xor_v8i32_v8i16: 3366; AVX1: # %bb.0: 3367; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3368; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3369; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3370; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3371; AVX1-NEXT: vzeroupper 3372; AVX1-NEXT: retq 3373; 3374; AVX2-LABEL: trunc_xor_v8i32_v8i16: 3375; AVX2: # %bb.0: 3376; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3377; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3378; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3379; AVX2-NEXT: # kill: def $xmm0 
killed $xmm0 killed $ymm0 3380; AVX2-NEXT: vzeroupper 3381; AVX2-NEXT: retq 3382; 3383; AVX512-LABEL: trunc_xor_v8i32_v8i16: 3384; AVX512: # %bb.0: 3385; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 3386; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3387; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3388; AVX512-NEXT: vzeroupper 3389; AVX512-NEXT: retq 3390 %1 = xor <8 x i32> %a0, %a1 3391 %2 = trunc <8 x i32> %1 to <8 x i16> 3392 ret <8 x i16> %2 3393} 3394 3395define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 3396; SSE-LABEL: trunc_xor_v16i64_v16i8: 3397; SSE: # %bb.0: 3398; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0 3399; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1 3400; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2 3401; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3 3402; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4 3403; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5 3404; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6 3405; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7 3406; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3407; SSE-NEXT: pand %xmm8, %xmm7 3408; SSE-NEXT: pand %xmm8, %xmm6 3409; SSE-NEXT: packuswb %xmm7, %xmm6 3410; SSE-NEXT: pand %xmm8, %xmm5 3411; SSE-NEXT: pand %xmm8, %xmm4 3412; SSE-NEXT: packuswb %xmm5, %xmm4 3413; SSE-NEXT: packuswb %xmm6, %xmm4 3414; SSE-NEXT: pand %xmm8, %xmm3 3415; SSE-NEXT: pand %xmm8, %xmm2 3416; SSE-NEXT: packuswb %xmm3, %xmm2 3417; SSE-NEXT: pand %xmm8, %xmm1 3418; SSE-NEXT: pand %xmm8, %xmm0 3419; SSE-NEXT: packuswb %xmm1, %xmm0 3420; SSE-NEXT: packuswb %xmm2, %xmm0 3421; SSE-NEXT: packuswb %xmm4, %xmm0 3422; SSE-NEXT: retq 3423; 3424; AVX1-LABEL: trunc_xor_v16i64_v16i8: 3425; AVX1: # %bb.0: 3426; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 3427; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 3428; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 3429; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 3430; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 3431; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3432; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3433; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3434; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3435; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3436; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3437; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3438; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3439; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3440; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3441; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3442; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3443; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3444; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3445; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3446; AVX1-NEXT: vzeroupper 3447; AVX1-NEXT: retq 3448; 3449; AVX2-LABEL: trunc_xor_v16i64_v16i8: 3450; AVX2: # %bb.0: 3451; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 3452; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1 3453; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 3454; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3 3455; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 3456; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 3457; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 3458; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 3459; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 3460; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 3461; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3462; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3463; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3464; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 3465; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3466; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3467; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3468; AVX2-NEXT: vzeroupper 
3469; AVX2-NEXT: retq 3470; 3471; AVX512-LABEL: trunc_xor_v16i64_v16i8: 3472; AVX512: # %bb.0: 3473; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 3474; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 3475; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3476; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3477; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3478; AVX512-NEXT: vzeroupper 3479; AVX512-NEXT: retq 3480 %1 = xor <16 x i64> %a0, %a1 3481 %2 = trunc <16 x i64> %1 to <16 x i8> 3482 ret <16 x i8> %2 3483} 3484 3485define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 3486; SSE-LABEL: trunc_xor_v16i32_v16i8: 3487; SSE: # %bb.0: 3488; SSE-NEXT: pxor %xmm4, %xmm0 3489; SSE-NEXT: pxor %xmm5, %xmm1 3490; SSE-NEXT: pxor %xmm6, %xmm2 3491; SSE-NEXT: pxor %xmm7, %xmm3 3492; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3493; SSE-NEXT: pand %xmm4, %xmm3 3494; SSE-NEXT: pand %xmm4, %xmm2 3495; SSE-NEXT: packuswb %xmm3, %xmm2 3496; SSE-NEXT: pand %xmm4, %xmm1 3497; SSE-NEXT: pand %xmm4, %xmm0 3498; SSE-NEXT: packuswb %xmm1, %xmm0 3499; SSE-NEXT: packuswb %xmm2, %xmm0 3500; SSE-NEXT: retq 3501; 3502; AVX1-LABEL: trunc_xor_v16i32_v16i8: 3503; AVX1: # %bb.0: 3504; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3505; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3506; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3507; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3508; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3509; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3510; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3511; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3512; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3513; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3514; AVX1-NEXT: vzeroupper 3515; AVX1-NEXT: retq 3516; 3517; AVX2-LABEL: trunc_xor_v16i32_v16i8: 3518; AVX2: # %bb.0: 3519; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 3520; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 3521; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3522; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 3523; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3524; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3525; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3526; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3527; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3528; AVX2-NEXT: vzeroupper 3529; AVX2-NEXT: retq 3530; 3531; AVX512-LABEL: trunc_xor_v16i32_v16i8: 3532; AVX512: # %bb.0: 3533; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 3534; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3535; AVX512-NEXT: vzeroupper 3536; AVX512-NEXT: retq 3537 %1 = xor <16 x i32> %a0, %a1 3538 %2 = trunc <16 x i32> %1 to <16 x i8> 3539 ret <16 x i8> %2 3540} 3541 3542define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 3543; SSE-LABEL: trunc_xor_v16i16_v16i8: 3544; SSE: # %bb.0: 3545; SSE-NEXT: pxor %xmm2, %xmm0 3546; SSE-NEXT: pxor %xmm3, %xmm1 3547; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3548; SSE-NEXT: pand %xmm2, %xmm1 3549; SSE-NEXT: pand %xmm2, %xmm0 3550; SSE-NEXT: packuswb %xmm1, %xmm0 3551; SSE-NEXT: retq 3552; 3553; AVX1-LABEL: trunc_xor_v16i16_v16i8: 3554; AVX1: # %bb.0: 3555; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3556; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3557; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3558; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3559; AVX1-NEXT: vzeroupper 3560; AVX1-NEXT: retq 3561; 3562; AVX2-LABEL: trunc_xor_v16i16_v16i8: 3563; AVX2: # %bb.0: 3564; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3565; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm0, %ymm0 3566; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3567; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3568; AVX2-NEXT: vzeroupper 3569; AVX2-NEXT: retq 3570; 3571; AVX512F-LABEL: trunc_xor_v16i16_v16i8: 3572; AVX512F: # %bb.0: 3573; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 3574; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3575; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3576; AVX512F-NEXT: vzeroupper 3577; AVX512F-NEXT: retq 3578; 3579; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: 3580; AVX512BW: # %bb.0: 3581; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 3582; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3583; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3584; AVX512BW-NEXT: vzeroupper 3585; AVX512BW-NEXT: retq 3586; 3587; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8: 3588; AVX512DQ: # %bb.0: 3589; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 3590; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3591; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3592; AVX512DQ-NEXT: vzeroupper 3593; AVX512DQ-NEXT: retq 3594 %1 = xor <16 x i16> %a0, %a1 3595 %2 = trunc <16 x i16> %1 to <16 x i8> 3596 ret <16 x i8> %2 3597} 3598 3599; 3600; xor to constant 3601; 3602 3603define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 3604; SSE-LABEL: trunc_xor_const_v4i64_v4i32: 3605; SSE: # %bb.0: 3606; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3607; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3608; SSE-NEXT: retq 3609; 3610; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: 3611; AVX1: # %bb.0: 3612; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3613; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3614; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3615; AVX1-NEXT: vzeroupper 3616; AVX1-NEXT: retq 3617; 3618; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: 3619; AVX2-SLOW: # %bb.0: 3620; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3621; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3622; AVX2-SLOW-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3623; AVX2-SLOW-NEXT: vzeroupper 3624; AVX2-SLOW-NEXT: retq 3625; 3626; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32: 3627; AVX2-FAST-ALL: # %bb.0: 3628; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 3629; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 3630; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3631; AVX2-FAST-ALL-NEXT: vzeroupper 3632; AVX2-FAST-ALL-NEXT: retq 3633; 3634; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32: 3635; AVX2-FAST-PERLANE: # %bb.0: 3636; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 3637; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3638; AVX2-FAST-PERLANE-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3639; AVX2-FAST-PERLANE-NEXT: vzeroupper 3640; AVX2-FAST-PERLANE-NEXT: retq 3641; 3642; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: 3643; AVX512: # %bb.0: 3644; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3645; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3646; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3647; AVX512-NEXT: vzeroupper 3648; AVX512-NEXT: retq 3649 %1 = xor <4 x i64> 
%a0, <i64 0, i64 1, i64 2, i64 3> 3650 %2 = trunc <4 x i64> %1 to <4 x i32> 3651 ret <4 x i32> %2 3652} 3653 3654define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 3655; SSE-LABEL: trunc_xor_const_v8i64_v8i16: 3656; SSE: # %bb.0: 3657; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3658; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3659; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3660; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3661; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3662; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3663; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3664; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3665; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3666; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3667; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3668; SSE-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3669; SSE-NEXT: retq 3670; 3671; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: 3672; AVX1: # %bb.0: 3673; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 3674; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3675; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3676; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3677; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3678; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3679; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3680; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3681; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3682; AVX1-NEXT: vzeroupper 3683; AVX1-NEXT: retq 3684; 3685; AVX2-LABEL: trunc_xor_const_v8i64_v8i16: 3686; AVX2: # %bb.0: 3687; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3688; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 3689; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 3690; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3691; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3692; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3693; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3694; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3695; AVX2-NEXT: vzeroupper 3696; AVX2-NEXT: retq 3697; 3698; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: 3699; AVX512: # %bb.0: 3700; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3701; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3702; AVX512-NEXT: vzeroupper 3703; AVX512-NEXT: retq 3704 %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 3705 %2 = trunc <8 x i64> %1 to <8 x i16> 3706 ret <8 x i16> %2 3707} 3708 3709define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 3710; SSE-LABEL: trunc_xor_const_v8i32_v8i16: 3711; SSE: # %bb.0: 3712; SSE-NEXT: pslld $16, %xmm1 3713; SSE-NEXT: psrad $16, %xmm1 3714; SSE-NEXT: pslld $16, %xmm0 3715; SSE-NEXT: psrad $16, %xmm0 3716; SSE-NEXT: packssdw %xmm1, %xmm0 3717; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3718; SSE-NEXT: retq 3719; 3720; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: 3721; AVX1: # %bb.0: 3722; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3723; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3724; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3725; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3726; AVX1-NEXT: vzeroupper 3727; AVX1-NEXT: retq 3728; 3729; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: 3730; AVX2: # %bb.0: 3731; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3732; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3733; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3734; AVX2-NEXT: vzeroupper 3735; AVX2-NEXT: retq 3736; 3737; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: 3738; AVX512: # %bb.0: 3739; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3740; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3741; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3742; AVX512-NEXT: vzeroupper 3743; AVX512-NEXT: retq 3744 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3745 %2 = trunc <8 x i32> %1 to <8 x i16> 3746 ret <8 x i16> %2 3747} 3748 3749define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 3750; SSE-LABEL: trunc_xor_const_v16i64_v16i8: 3751; SSE: # %bb.0: 3752; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3753; SSE-NEXT: pand %xmm8, %xmm7 3754; SSE-NEXT: pand %xmm8, %xmm6 3755; SSE-NEXT: packuswb %xmm7, %xmm6 3756; SSE-NEXT: pand %xmm8, %xmm5 3757; SSE-NEXT: pand %xmm8, %xmm4 3758; SSE-NEXT: packuswb %xmm5, %xmm4 3759; SSE-NEXT: packuswb %xmm6, %xmm4 3760; SSE-NEXT: pand %xmm8, %xmm3 3761; SSE-NEXT: pand %xmm8, %xmm2 3762; SSE-NEXT: packuswb %xmm3, %xmm2 3763; SSE-NEXT: pand %xmm8, %xmm1 3764; SSE-NEXT: pand %xmm8, %xmm0 3765; SSE-NEXT: packuswb %xmm1, %xmm0 3766; SSE-NEXT: packuswb %xmm2, %xmm0 3767; SSE-NEXT: packuswb %xmm4, %xmm0 3768; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3769; SSE-NEXT: retq 3770; 3771; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: 3772; AVX1: # %bb.0: 3773; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 3774; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3775; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3776; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3777; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3778; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3779; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3780; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3781; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3782; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3783; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3784; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3785; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3786; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3787; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3788; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3789; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3790; AVX1-NEXT: vzeroupper 3791; AVX1-NEXT: retq 3792; 3793; AVX2-LABEL: trunc_xor_const_v16i64_v16i8: 3794; AVX2: # %bb.0: 3795; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 3796; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 3797; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 3798; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 3799; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 3800; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 3801; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3802; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3803; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3804; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 3805; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3806; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3807; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3808; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3809; AVX2-NEXT: vzeroupper 3810; AVX2-NEXT: retq 3811; 3812; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: 3813; AVX512: # %bb.0: 3814; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3815; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3816; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3817; 
AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3818; AVX512-NEXT: vzeroupper 3819; AVX512-NEXT: retq 3820 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 3821 %2 = trunc <16 x i64> %1 to <16 x i8> 3822 ret <16 x i8> %2 3823} 3824 3825define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 3826; SSE-LABEL: trunc_xor_const_v16i32_v16i8: 3827; SSE: # %bb.0: 3828; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3829; SSE-NEXT: pand %xmm4, %xmm3 3830; SSE-NEXT: pand %xmm4, %xmm2 3831; SSE-NEXT: packuswb %xmm3, %xmm2 3832; SSE-NEXT: pand %xmm4, %xmm1 3833; SSE-NEXT: pand %xmm4, %xmm0 3834; SSE-NEXT: packuswb %xmm1, %xmm0 3835; SSE-NEXT: packuswb %xmm2, %xmm0 3836; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3837; SSE-NEXT: retq 3838; 3839; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: 3840; AVX1: # %bb.0: 3841; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3842; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3843; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3844; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3845; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3846; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3847; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3848; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3849; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3850; AVX1-NEXT: vzeroupper 3851; AVX1-NEXT: retq 3852; 3853; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: 3854; AVX2: # %bb.0: 3855; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3856; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 3857; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3858; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3859; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3860; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3861; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3862; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3863; AVX2-NEXT: vzeroupper 3864; AVX2-NEXT: retq 3865; 3866; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: 3867; AVX512: # %bb.0: 3868; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3869; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3870; AVX512-NEXT: vzeroupper 3871; AVX512-NEXT: retq 3872 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3873 %2 = trunc <16 x i32> %1 to <16 x i8> 3874 ret <16 x i8> %2 3875} 3876 3877define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3878; SSE-LABEL: trunc_xor_const_v16i16_v16i8: 3879; SSE: # %bb.0: 3880; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3881; SSE-NEXT: pand %xmm2, %xmm1 3882; SSE-NEXT: pand %xmm2, %xmm0 3883; SSE-NEXT: packuswb %xmm1, %xmm0 3884; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3885; SSE-NEXT: retq 3886; 3887; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: 3888; AVX1: # %bb.0: 3889; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3890; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3891; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3892; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3893; AVX1-NEXT: vzeroupper 3894; AVX1-NEXT: retq 3895; 3896; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: 3897; AVX2: # %bb.0: 3898; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3899; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3900; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3901; AVX2-NEXT: vpxor 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3902; AVX2-NEXT: vzeroupper 3903; AVX2-NEXT: retq 3904; 3905; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: 3906; AVX512F: # %bb.0: 3907; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3908; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3909; AVX512F-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3910; AVX512F-NEXT: vzeroupper 3911; AVX512F-NEXT: retq 3912; 3913; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: 3914; AVX512BW: # %bb.0: 3915; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3916; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3917; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3918; AVX512BW-NEXT: vzeroupper 3919; AVX512BW-NEXT: retq 3920; 3921; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: 3922; AVX512DQ: # %bb.0: 3923; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3924; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3925; AVX512DQ-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3926; AVX512DQ-NEXT: vzeroupper 3927; AVX512DQ-NEXT: retq 3928 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3929 %2 = trunc <16 x i16> %1 to <16 x i8> 3930 ret <16 x i8> %2 3931} 3932 3933; 3934; or 3935; 3936 3937define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3938; SSE-LABEL: trunc_or_v4i64_v4i32: 3939; SSE: # %bb.0: 3940; SSE-NEXT: orps %xmm3, %xmm1 3941; SSE-NEXT: orps %xmm2, %xmm0 3942; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3943; SSE-NEXT: retq 3944; 3945; AVX1-LABEL: trunc_or_v4i64_v4i32: 3946; AVX1: # %bb.0: 3947; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 3948; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3949; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3950; AVX1-NEXT: vzeroupper 3951; AVX1-NEXT: retq 3952; 3953; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: 3954; AVX2-SLOW: # %bb.0: 3955; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 3956; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3957; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3958; AVX2-SLOW-NEXT: vzeroupper 3959; AVX2-SLOW-NEXT: retq 3960; 3961; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32: 3962; AVX2-FAST-ALL: # %bb.0: 3963; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0 3964; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 3965; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 3966; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3967; AVX2-FAST-ALL-NEXT: vzeroupper 3968; AVX2-FAST-ALL-NEXT: retq 3969; 3970; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32: 3971; AVX2-FAST-PERLANE: # %bb.0: 3972; AVX2-FAST-PERLANE-NEXT: vorps %ymm1, %ymm0, %ymm0 3973; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 3974; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3975; AVX2-FAST-PERLANE-NEXT: vzeroupper 3976; AVX2-FAST-PERLANE-NEXT: retq 3977; 3978; AVX512-LABEL: trunc_or_v4i64_v4i32: 3979; AVX512: # %bb.0: 3980; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 3981; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3982; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3983; AVX512-NEXT: vzeroupper 
3984; AVX512-NEXT: retq 3985 %1 = or <4 x i64> %a0, %a1 3986 %2 = trunc <4 x i64> %1 to <4 x i32> 3987 ret <4 x i32> %2 3988} 3989 3990define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 3991; SSE-LABEL: trunc_or_v8i64_v8i16: 3992; SSE: # %bb.0: 3993; SSE-NEXT: por %xmm6, %xmm2 3994; SSE-NEXT: por %xmm7, %xmm3 3995; SSE-NEXT: por %xmm4, %xmm0 3996; SSE-NEXT: por %xmm5, %xmm1 3997; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3998; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3999; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4000; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 4001; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 4002; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 4003; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 4004; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 4005; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 4006; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4007; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 4008; SSE-NEXT: retq 4009; 4010; AVX1-LABEL: trunc_or_v8i64_v8i16: 4011; AVX1: # %bb.0: 4012; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 4013; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 4014; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 4015; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 4016; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4017; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4018; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 4019; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4020; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4021; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4022; AVX1-NEXT: vzeroupper 4023; AVX1-NEXT: retq 4024; 4025; AVX2-LABEL: trunc_or_v8i64_v8i16: 4026; AVX2: # %bb.0: 4027; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 4028; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 4029; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 4030; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 4031; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 4032; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 4033; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4034; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4035; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 4036; AVX2-NEXT: vzeroupper 4037; AVX2-NEXT: retq 4038; 4039; AVX512-LABEL: trunc_or_v8i64_v8i16: 4040; AVX512: # %bb.0: 4041; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 4042; AVX512-NEXT: vpmovqw %zmm0, %xmm0 4043; AVX512-NEXT: vzeroupper 4044; AVX512-NEXT: retq 4045 %1 = or <8 x i64> %a0, %a1 4046 %2 = trunc <8 x i64> %1 to <8 x i16> 4047 ret <8 x i16> %2 4048} 4049 4050define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 4051; SSE-LABEL: trunc_or_v8i32_v8i16: 4052; SSE: # %bb.0: 4053; SSE-NEXT: por %xmm2, %xmm0 4054; SSE-NEXT: por %xmm3, %xmm1 4055; SSE-NEXT: pslld $16, %xmm1 4056; SSE-NEXT: psrad $16, %xmm1 4057; SSE-NEXT: pslld $16, %xmm0 4058; SSE-NEXT: psrad $16, %xmm0 4059; SSE-NEXT: packssdw %xmm1, %xmm0 4060; SSE-NEXT: retq 4061; 4062; AVX1-LABEL: trunc_or_v8i32_v8i16: 4063; AVX1: # %bb.0: 4064; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 4065; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4066; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4067; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4068; AVX1-NEXT: vzeroupper 4069; AVX1-NEXT: retq 4070; 4071; AVX2-LABEL: trunc_or_v8i32_v8i16: 4072; AVX2: # %bb.0: 4073; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 4074; AVX2-NEXT: vpshufb 
{{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 4075; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4076; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4077; AVX2-NEXT: vzeroupper 4078; AVX2-NEXT: retq 4079; 4080; AVX512-LABEL: trunc_or_v8i32_v8i16: 4081; AVX512: # %bb.0: 4082; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 4083; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4084; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4085; AVX512-NEXT: vzeroupper 4086; AVX512-NEXT: retq 4087 %1 = or <8 x i32> %a0, %a1 4088 %2 = trunc <8 x i32> %1 to <8 x i16> 4089 ret <8 x i16> %2 4090} 4091 4092define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 4093; SSE-LABEL: trunc_or_v16i64_v16i8: 4094; SSE: # %bb.0: 4095; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0 4096; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1 4097; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2 4098; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3 4099; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4 4100; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5 4101; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6 4102; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7 4103; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4104; SSE-NEXT: pand %xmm8, %xmm7 4105; SSE-NEXT: pand %xmm8, %xmm6 4106; SSE-NEXT: packuswb %xmm7, %xmm6 4107; SSE-NEXT: pand %xmm8, %xmm5 4108; SSE-NEXT: pand %xmm8, %xmm4 4109; SSE-NEXT: packuswb %xmm5, %xmm4 4110; SSE-NEXT: packuswb %xmm6, %xmm4 4111; SSE-NEXT: pand %xmm8, %xmm3 4112; SSE-NEXT: pand %xmm8, %xmm2 4113; SSE-NEXT: packuswb %xmm3, %xmm2 4114; SSE-NEXT: pand %xmm8, %xmm1 4115; SSE-NEXT: pand %xmm8, %xmm0 4116; SSE-NEXT: packuswb %xmm1, %xmm0 4117; SSE-NEXT: packuswb %xmm2, %xmm0 4118; SSE-NEXT: packuswb %xmm4, %xmm0 4119; SSE-NEXT: retq 4120; 4121; AVX1-LABEL: trunc_or_v16i64_v16i8: 4122; AVX1: # %bb.0: 4123; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 4124; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 4125; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 4126; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 4127; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 4128; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 4129; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 4130; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 4131; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 4132; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 4133; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 4134; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 4135; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 4136; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4137; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4138; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 4139; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 4140; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 4141; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4142; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 4143; AVX1-NEXT: vzeroupper 4144; AVX1-NEXT: retq 4145; 4146; AVX2-LABEL: trunc_or_v16i64_v16i8: 4147; AVX2: # %bb.0: 4148; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0 4149; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1 4150; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2 4151; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3 4152; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 4153; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 4154; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 4155; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 4156; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 4157; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 4158; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 4159; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 4160; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 4161; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 4162; AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm1 4163; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4164; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 4165; AVX2-NEXT: vzeroupper 4166; AVX2-NEXT: retq 4167; 4168; AVX512-LABEL: trunc_or_v16i64_v16i8: 4169; AVX512: # %bb.0: 4170; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0 4171; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 4172; AVX512-NEXT: vpmovqb %zmm1, %xmm1 4173; AVX512-NEXT: vpmovqb %zmm0, %xmm0 4174; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4175; AVX512-NEXT: vzeroupper 4176; AVX512-NEXT: retq 4177 %1 = or <16 x i64> %a0, %a1 4178 %2 = trunc <16 x i64> %1 to <16 x i8> 4179 ret <16 x i8> %2 4180} 4181 4182define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 4183; SSE-LABEL: trunc_or_v16i32_v16i8: 4184; SSE: # %bb.0: 4185; SSE-NEXT: por %xmm4, %xmm0 4186; SSE-NEXT: por %xmm5, %xmm1 4187; SSE-NEXT: por %xmm6, %xmm2 4188; SSE-NEXT: por %xmm7, %xmm3 4189; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 4190; SSE-NEXT: pand %xmm4, %xmm3 4191; SSE-NEXT: pand %xmm4, %xmm2 4192; SSE-NEXT: packuswb %xmm3, %xmm2 4193; SSE-NEXT: pand %xmm4, %xmm1 4194; SSE-NEXT: pand %xmm4, %xmm0 4195; SSE-NEXT: packuswb %xmm1, %xmm0 4196; SSE-NEXT: packuswb %xmm2, %xmm0 4197; SSE-NEXT: retq 4198; 4199; AVX1-LABEL: trunc_or_v16i32_v16i8: 4200; AVX1: # %bb.0: 4201; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 4202; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 4203; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 4204; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 4205; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4206; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4207; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 4208; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4209; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4210; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4211; AVX1-NEXT: vzeroupper 4212; AVX1-NEXT: retq 4213; 4214; AVX2-LABEL: trunc_or_v16i32_v16i8: 4215; AVX2: # %bb.0: 4216; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 4217; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 4218; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 4219; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 4220; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 4221; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 4222; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4223; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4224; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 4225; AVX2-NEXT: vzeroupper 4226; AVX2-NEXT: retq 4227; 4228; AVX512-LABEL: trunc_or_v16i32_v16i8: 4229; AVX512: # %bb.0: 4230; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 4231; AVX512-NEXT: vpmovdb %zmm0, %xmm0 4232; AVX512-NEXT: vzeroupper 4233; AVX512-NEXT: retq 4234 %1 = or <16 x i32> %a0, %a1 4235 %2 = trunc <16 x i32> %1 to <16 x i8> 4236 ret <16 x i8> %2 4237} 4238 4239define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 4240; SSE-LABEL: trunc_or_v16i16_v16i8: 4241; SSE: # %bb.0: 4242; SSE-NEXT: por %xmm2, %xmm0 4243; SSE-NEXT: por %xmm3, %xmm1 4244; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 4245; SSE-NEXT: pand %xmm2, %xmm1 4246; SSE-NEXT: pand %xmm2, %xmm0 4247; SSE-NEXT: packuswb %xmm1, %xmm0 4248; SSE-NEXT: retq 4249; 4250; AVX1-LABEL: trunc_or_v16i16_v16i8: 4251; AVX1: # %bb.0: 4252; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 4253; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4254; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4255; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4256; AVX1-NEXT: vzeroupper 4257; AVX1-NEXT: retq 4258; 4259; AVX2-LABEL: 
trunc_or_v16i16_v16i8: 4260; AVX2: # %bb.0: 4261; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 4262; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 4263; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4264; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 4265; AVX2-NEXT: vzeroupper 4266; AVX2-NEXT: retq 4267; 4268; AVX512F-LABEL: trunc_or_v16i16_v16i8: 4269; AVX512F: # %bb.0: 4270; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 4271; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 4272; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 4273; AVX512F-NEXT: vzeroupper 4274; AVX512F-NEXT: retq 4275; 4276; AVX512BW-LABEL: trunc_or_v16i16_v16i8: 4277; AVX512BW: # %bb.0: 4278; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 4279; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 4280; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4281; AVX512BW-NEXT: vzeroupper 4282; AVX512BW-NEXT: retq 4283; 4284; AVX512DQ-LABEL: trunc_or_v16i16_v16i8: 4285; AVX512DQ: # %bb.0: 4286; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 4287; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 4288; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 4289; AVX512DQ-NEXT: vzeroupper 4290; AVX512DQ-NEXT: retq 4291 %1 = or <16 x i16> %a0, %a1 4292 %2 = trunc <16 x i16> %1 to <16 x i8> 4293 ret <16 x i8> %2 4294} 4295 4296; 4297; or to constant 4298; 4299 4300define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 4301; SSE-LABEL: trunc_or_const_v4i64_v4i32: 4302; SSE: # %bb.0: 4303; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 4304; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 4305; SSE-NEXT: retq 4306; 4307; AVX1-LABEL: trunc_or_const_v4i64_v4i32: 4308; AVX1: # %bb.0: 4309; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4310; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 4311; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 4312; AVX1-NEXT: vzeroupper 4313; AVX1-NEXT: retq 4314; 4315; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32: 4316; AVX2-SLOW: # %bb.0: 4317; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 4318; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 4319; AVX2-SLOW-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 4320; AVX2-SLOW-NEXT: vzeroupper 4321; AVX2-SLOW-NEXT: retq 4322; 4323; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32: 4324; AVX2-FAST-ALL: # %bb.0: 4325; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 4326; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 4327; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 4328; AVX2-FAST-ALL-NEXT: vzeroupper 4329; AVX2-FAST-ALL-NEXT: retq 4330; 4331; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32: 4332; AVX2-FAST-PERLANE: # %bb.0: 4333; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 4334; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 4335; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 4336; AVX2-FAST-PERLANE-NEXT: vzeroupper 4337; AVX2-FAST-PERLANE-NEXT: retq 4338; 4339; AVX512-LABEL: trunc_or_const_v4i64_v4i32: 4340; AVX512: # %bb.0: 4341; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 4342; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4343; AVX512-NEXT: vpor 

define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; complex patterns - often created by vectorizer
;
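; These functions model the sign-extend/multiply/add chains over i64 that a
; loop vectorizer commonly emits before the result is truncated back to i32;
; conceptually (an illustrative sketch, not one of the checked tests):
;
;   %x = sext <4 x i32> %a to <4 x i64>
;   %y = sext <4 x i32> %b to <4 x i64>
;   %m = mul <4 x i64> %x, %y
;   %t = trunc <4 x i64> %m to <4 x i32>
;
; Only the low 32 bits of the product survive the truncation, so the chain is
; expected to collapse to a 32-bit multiply (vpmulld in the AVX checks below;
; the SSE2 lowering builds it from pmuludq pairs instead).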

define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %1, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}
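; Note on @mul_add_multiuse_v4i64_v4i32 above: because the sign-extended %1 is
; used by both the multiply and the add, the truncated result is simply
; %a0 + low32(%a0 * %a1), so the checks reuse the original 32-bit input for the
; add (paddd %xmm4, %xmm0 / vpaddd %xmm1, %xmm0, %xmm0) rather than
; re-truncating a widened sum.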