; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ

;
; add
;

define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_add_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_add_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddq %xmm6, %xmm2
; SSE-NEXT: paddq %xmm7, %xmm3
; SSE-NEXT: paddq %xmm4, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm2, %xmm0
; SSE-NEXT: paddd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: paddd %xmm5, %xmm1
; SSE-NEXT: paddd %xmm6, %xmm2
; SSE-NEXT: paddd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_add_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: paddw %xmm2, %xmm0
; SSE-NEXT: paddw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = add <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm2
; SSE-NEXT: psrad $16, %xmm2
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: packssdw %xmm2, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: psraw $8, %xmm0
; SSE-NEXT: paddw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512-NEXT: vpmovdw %zmm1, %ymm1
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = sext <8 x i8> %1 to <8 x i32>
  %3 = add <8 x i32> %2, %a1
  %4 = trunc <8 x i32> %3 to <8 x i16>
  ret <8 x i16> %4
}

;
; add to constant
;

define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_add_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_add_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; sub
;

define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: psubq %xmm3, %xmm1
; SSE-NEXT: psubq %xmm2, %xmm0
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_sub_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <4 x i64> %a0, %a1
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psubq %xmm6, %xmm2
; SSE-NEXT: psubq %xmm7, %xmm3
; SSE-NEXT: psubq %xmm4, %xmm0
; SSE-NEXT: psubq %xmm5, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i64> %a0, %a1
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm2, %xmm0
; SSE-NEXT: psubd %xmm3, %xmm1
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i32> %a0, %a1
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubd %xmm4, %xmm0
; SSE-NEXT: psubd %xmm5, %xmm1
; SSE-NEXT: psubd %xmm6, %xmm2
; SSE-NEXT: psubd %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_sub_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubw %xmm2, %xmm0
; SSE-NEXT: psubw %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = sub <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %a = zext <16 x i8> %x to <16 x i16>
  %b = zext <16 x i8> %y to <16 x i16>
  %c = sub <16 x i16> %a, %b
  %d = trunc <16 x i16> %c to <16 x i8>
  ret <16 x i8> %d
}

;
; sub to constant
;

define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: psubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpsubw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
%ymm1, %ymm0, %ymm0 1471; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1472; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1473; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1474; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1475; AVX2-NEXT: vzeroupper 1476; AVX2-NEXT: retq 1477; 1478; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: 1479; AVX512: # %bb.0: 1480; AVX512-NEXT: vpmovdb %zmm0, %xmm0 1481; AVX512-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1482; AVX512-NEXT: vzeroupper 1483; AVX512-NEXT: retq 1484 %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 1485 %2 = trunc <16 x i32> %1 to <16 x i8> 1486 ret <16 x i8> %2 1487} 1488 1489define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 1490; SSE-LABEL: trunc_sub_const_v16i16_v16i8: 1491; SSE: # %bb.0: 1492; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 1493; SSE-NEXT: pand %xmm2, %xmm1 1494; SSE-NEXT: pand %xmm2, %xmm0 1495; SSE-NEXT: packuswb %xmm1, %xmm0 1496; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1497; SSE-NEXT: retq 1498; 1499; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: 1500; AVX1: # %bb.0: 1501; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1502; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 1503; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1504; AVX1-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1505; AVX1-NEXT: vzeroupper 1506; AVX1-NEXT: retq 1507; 1508; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: 1509; AVX2: # %bb.0: 1510; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 1511; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1512; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1513; AVX2-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1514; AVX2-NEXT: vzeroupper 1515; AVX2-NEXT: retq 1516; 1517; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: 1518; AVX512F: # %bb.0: 1519; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1520; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 1521; AVX512F-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1522; AVX512F-NEXT: vzeroupper 1523; AVX512F-NEXT: retq 1524; 1525; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: 1526; AVX512BW: # %bb.0: 1527; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1528; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 1529; AVX512BW-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1530; AVX512BW-NEXT: vzeroupper 1531; AVX512BW-NEXT: retq 1532; 1533; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: 1534; AVX512DQ: # %bb.0: 1535; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1536; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 1537; AVX512DQ-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1538; AVX512DQ-NEXT: vzeroupper 1539; AVX512DQ-NEXT: retq 1540 %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1541 %2 = trunc <16 x i16> %1 to <16 x i8> 1542 ret <16 x i8> %2 1543} 1544 1545define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) { 1546; SSE-LABEL: 
trunc_ext_sub_const_rhs_v16i16_v16i8: 1547; SSE: # %bb.0: 1548; SSE-NEXT: psubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 1549; SSE-NEXT: retq 1550; 1551; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: 1552; AVX: # %bb.0: 1553; AVX-NEXT: vpsubb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1554; AVX-NEXT: retq 1555 %a = zext <16 x i8> %x to <16 x i16> 1556 %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 1557 %c = trunc <16 x i16> %b to <16 x i8> 1558 ret <16 x i8> %c 1559} 1560 1561define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) { 1562; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: 1563; SSE: # %bb.0: 1564; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1565; SSE-NEXT: psubb %xmm0, %xmm1 1566; SSE-NEXT: movdqa %xmm1, %xmm0 1567; SSE-NEXT: retq 1568; 1569; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: 1570; AVX: # %bb.0: 1571; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] 1572; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0 1573; AVX-NEXT: retq 1574 %a = zext <16 x i8> %x to <16 x i16> 1575 %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a 1576 %c = trunc <16 x i16> %b to <16 x i8> 1577 ret <16 x i8> %c 1578} 1579 1580; 1581; mul 1582; 1583 1584define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 1585; SSE-LABEL: trunc_mul_v4i64_v4i32: 1586; SSE: # %bb.0: 1587; SSE-NEXT: pmuludq %xmm3, %xmm1 1588; SSE-NEXT: pmuludq %xmm2, %xmm0 1589; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 1590; SSE-NEXT: retq 1591; 1592; AVX1-LABEL: trunc_mul_v4i64_v4i32: 1593; AVX1: # %bb.0: 1594; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 1595; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1596; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 1597; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1598; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1599; AVX1-NEXT: vzeroupper 1600; AVX1-NEXT: retq 1601; 1602; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: 1603; AVX2-SLOW: # %bb.0: 1604; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 1605; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1606; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 1607; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1608; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1609; AVX2-SLOW-NEXT: vzeroupper 1610; AVX2-SLOW-NEXT: retq 1611; 1612; AVX2-FAST-ALL-LABEL: trunc_mul_v4i64_v4i32: 1613; AVX2-FAST-ALL: # %bb.0: 1614; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] 1615; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 1616; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 1617; AVX2-FAST-ALL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1618; AVX2-FAST-ALL-NEXT: vzeroupper 1619; AVX2-FAST-ALL-NEXT: retq 1620; 1621; AVX2-FAST-PERLANE-LABEL: trunc_mul_v4i64_v4i32: 1622; AVX2-FAST-PERLANE: # %bb.0: 1623; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 1624; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] 1625; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2 1626; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] 1627; AVX2-FAST-PERLANE-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1628; AVX2-FAST-PERLANE-NEXT: vzeroupper 1629; AVX2-FAST-PERLANE-NEXT: retq 1630; 1631; AVX512F-LABEL: trunc_mul_v4i64_v4i32: 1632; AVX512F: # %bb.0: 1633; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1634; AVX512F-NEXT: # 
kill: def $ymm0 killed $ymm0 def $zmm0 1635; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 1636; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 1637; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1638; AVX512F-NEXT: vzeroupper 1639; AVX512F-NEXT: retq 1640; 1641; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: 1642; AVX512BW: # %bb.0: 1643; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1644; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1645; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 1646; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 1647; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1648; AVX512BW-NEXT: vzeroupper 1649; AVX512BW-NEXT: retq 1650; 1651; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: 1652; AVX512DQ: # %bb.0: 1653; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 1654; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 1655; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1656; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 1657; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1658; AVX512DQ-NEXT: vzeroupper 1659; AVX512DQ-NEXT: retq 1660 %1 = mul <4 x i64> %a0, %a1 1661 %2 = trunc <4 x i64> %1 to <4 x i32> 1662 ret <4 x i32> %2 1663} 1664 1665define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 1666; SSE-LABEL: trunc_mul_v8i64_v8i16: 1667; SSE: # %bb.0: 1668; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] 1669; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] 1670; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1671; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] 1672; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 1673; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] 1674; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] 1675; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 1676; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] 1677; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 1678; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] 1679; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1680; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 1681; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1682; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 1683; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1684; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 1685; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 1686; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 1687; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 1688; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1689; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 1690; SSE-NEXT: pmullw %xmm6, %xmm0 1691; SSE-NEXT: retq 1692; 1693; AVX1-LABEL: trunc_mul_v8i64_v8i16: 1694; AVX1: # %bb.0: 1695; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] 1696; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 1697; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 1698; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 1699; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 1700; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 1701; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 1702; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1703; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 1704; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 1705; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 1706; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 1707; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 1708; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 1709; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1710; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1711; AVX1-NEXT: vzeroupper 1712; AVX1-NEXT: retq 1713; 1714; 
AVX2-LABEL: trunc_mul_v8i64_v8i16: 1715; AVX2: # %bb.0: 1716; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 1717; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7],ymm3[8],ymm4[9,10,11],ymm3[12],ymm4[13,14,15] 1718; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7],ymm2[8],ymm4[9,10,11],ymm2[12],ymm4[13,14,15] 1719; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1720; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1721; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1722; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2,3],ymm1[4],ymm4[5,6,7],ymm1[8],ymm4[9,10,11],ymm1[12],ymm4[13,14,15] 1723; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7],ymm0[8],ymm4[9,10,11],ymm0[12],ymm4[13,14,15] 1724; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1725; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1726; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1727; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 1728; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1729; AVX2-NEXT: vzeroupper 1730; AVX2-NEXT: retq 1731; 1732; AVX512F-LABEL: trunc_mul_v8i64_v8i16: 1733; AVX512F: # %bb.0: 1734; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 1735; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 1736; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1737; AVX512F-NEXT: vzeroupper 1738; AVX512F-NEXT: retq 1739; 1740; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: 1741; AVX512BW: # %bb.0: 1742; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 1743; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 1744; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1745; AVX512BW-NEXT: vzeroupper 1746; AVX512BW-NEXT: retq 1747; 1748; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: 1749; AVX512DQ: # %bb.0: 1750; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 1751; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 1752; AVX512DQ-NEXT: vzeroupper 1753; AVX512DQ-NEXT: retq 1754 %1 = mul <8 x i64> %a0, %a1 1755 %2 = trunc <8 x i64> %1 to <8 x i16> 1756 ret <8 x i16> %2 1757} 1758 1759define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 1760; SSE-LABEL: trunc_mul_v8i32_v8i16: 1761; SSE: # %bb.0: 1762; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 1763; SSE-NEXT: pmuludq %xmm2, %xmm0 1764; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1765; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 1766; SSE-NEXT: pmuludq %xmm4, %xmm2 1767; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1768; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1769; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 1770; SSE-NEXT: pmuludq %xmm3, %xmm1 1771; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1772; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 1773; SSE-NEXT: pmuludq %xmm2, %xmm3 1774; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 1775; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] 1776; SSE-NEXT: pslld $16, %xmm1 1777; SSE-NEXT: psrad $16, %xmm1 1778; SSE-NEXT: pslld $16, %xmm0 1779; SSE-NEXT: psrad $16, %xmm0 1780; SSE-NEXT: packssdw %xmm1, %xmm0 1781; SSE-NEXT: retq 1782; 1783; AVX1-LABEL: trunc_mul_v8i32_v8i16: 1784; AVX1: # %bb.0: 1785; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 1786; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1787; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1788; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 1789; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 1790; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 1791; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 1792; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] 1793; AVX1-NEXT: vzeroupper 1794; AVX1-NEXT: retq 1795; 1796; AVX2-LABEL: trunc_mul_v8i32_v8i16: 1797; AVX2: # 
%bb.0: 1798; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1799; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 1800; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 1801; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1802; AVX2-NEXT: vzeroupper 1803; AVX2-NEXT: retq 1804; 1805; AVX512-LABEL: trunc_mul_v8i32_v8i16: 1806; AVX512: # %bb.0: 1807; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 1808; AVX512-NEXT: vpmovdw %zmm0, %ymm0 1809; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 1810; AVX512-NEXT: vzeroupper 1811; AVX512-NEXT: retq 1812 %1 = mul <8 x i32> %a0, %a1 1813 %2 = trunc <8 x i32> %1 to <8 x i16> 1814 ret <8 x i16> %2 1815} 1816 1817define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 1818; SSE-LABEL: trunc_mul_v16i64_v16i8: 1819; SSE: # %bb.0: 1820; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0 1821; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1 1822; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2 1823; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3 1824; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4 1825; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5 1826; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6 1827; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7 1828; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 1829; SSE-NEXT: pand %xmm8, %xmm7 1830; SSE-NEXT: pand %xmm8, %xmm6 1831; SSE-NEXT: packuswb %xmm7, %xmm6 1832; SSE-NEXT: pand %xmm8, %xmm5 1833; SSE-NEXT: pand %xmm8, %xmm4 1834; SSE-NEXT: packuswb %xmm5, %xmm4 1835; SSE-NEXT: packuswb %xmm6, %xmm4 1836; SSE-NEXT: pand %xmm8, %xmm3 1837; SSE-NEXT: pand %xmm8, %xmm2 1838; SSE-NEXT: packuswb %xmm3, %xmm2 1839; SSE-NEXT: pand %xmm8, %xmm1 1840; SSE-NEXT: pand %xmm8, %xmm0 1841; SSE-NEXT: packuswb %xmm1, %xmm0 1842; SSE-NEXT: packuswb %xmm2, %xmm0 1843; SSE-NEXT: packuswb %xmm4, %xmm0 1844; SSE-NEXT: retq 1845; 1846; AVX1-LABEL: trunc_mul_v16i64_v16i8: 1847; AVX1: # %bb.0: 1848; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8 1849; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 1850; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1851; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 1852; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 1853; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 1854; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1855; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 1856; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5 1857; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 1858; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1859; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 1860; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6 1861; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 1862; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1863; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 1864; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] 1865; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 1866; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 1867; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 1868; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 1869; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 1870; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 1871; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 1872; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 1873; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 1874; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 1875; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 1876; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 1877; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 1878; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 1879; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 1880; AVX1-NEXT: vzeroupper 1881; AVX1-NEXT: retq 1882; 1883; AVX2-LABEL: trunc_mul_v16i64_v16i8: 1884; AVX2: # %bb.0: 1885; 
AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm0 1886; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm1 1887; AVX2-NEXT: vpmuludq %ymm6, %ymm2, %ymm2 1888; AVX2-NEXT: vpmuludq %ymm7, %ymm3, %ymm3 1889; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 1890; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 1891; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 1892; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 1893; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 1894; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 1895; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 1896; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 1897; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 1898; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 1899; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1900; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1901; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 1902; AVX2-NEXT: vzeroupper 1903; AVX2-NEXT: retq 1904; 1905; AVX512F-LABEL: trunc_mul_v16i64_v16i8: 1906; AVX512F: # %bb.0: 1907; AVX512F-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 1908; AVX512F-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 1909; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 1910; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 1911; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1912; AVX512F-NEXT: vzeroupper 1913; AVX512F-NEXT: retq 1914; 1915; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: 1916; AVX512BW: # %bb.0: 1917; AVX512BW-NEXT: vpmuludq %zmm2, %zmm0, %zmm0 1918; AVX512BW-NEXT: vpmuludq %zmm3, %zmm1, %zmm1 1919; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 1920; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 1921; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1922; AVX512BW-NEXT: vzeroupper 1923; AVX512BW-NEXT: retq 1924; 1925; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: 1926; AVX512DQ: # %bb.0: 1927; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 1928; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 1929; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 1930; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 1931; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1932; AVX512DQ-NEXT: vzeroupper 1933; AVX512DQ-NEXT: retq 1934 %1 = mul <16 x i64> %a0, %a1 1935 %2 = trunc <16 x i64> %1 to <16 x i8> 1936 ret <16 x i8> %2 1937} 1938 1939define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 1940; SSE-LABEL: trunc_mul_v16i32_v16i8: 1941; SSE: # %bb.0: 1942; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] 1943; SSE-NEXT: pmuludq %xmm4, %xmm0 1944; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1945; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 1946; SSE-NEXT: pmuludq %xmm8, %xmm4 1947; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1948; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 1949; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 1950; SSE-NEXT: pmuludq %xmm5, %xmm1 1951; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1952; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 1953; SSE-NEXT: pmuludq %xmm4, %xmm5 1954; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1955; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 1956; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 1957; SSE-NEXT: pmuludq %xmm6, %xmm2 1958; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 1959; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] 1960; SSE-NEXT: pmuludq %xmm4, %xmm5 1961; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1962; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 1963; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 1964; SSE-NEXT: pmuludq %xmm7, %xmm3 1965; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 1966; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] 1967; SSE-NEXT: 
pmuludq %xmm4, %xmm5 1968; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] 1969; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 1970; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 1971; SSE-NEXT: pand %xmm4, %xmm3 1972; SSE-NEXT: pand %xmm4, %xmm2 1973; SSE-NEXT: packuswb %xmm3, %xmm2 1974; SSE-NEXT: pand %xmm4, %xmm1 1975; SSE-NEXT: pand %xmm4, %xmm0 1976; SSE-NEXT: packuswb %xmm1, %xmm0 1977; SSE-NEXT: packuswb %xmm2, %xmm0 1978; SSE-NEXT: retq 1979; 1980; AVX1-LABEL: trunc_mul_v16i32_v16i8: 1981; AVX1: # %bb.0: 1982; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 1983; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 1984; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1985; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 1986; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 1987; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 1988; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1989; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 1990; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] 1991; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 1992; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 1993; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 1994; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 1995; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 1996; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 1997; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1998; AVX1-NEXT: vzeroupper 1999; AVX1-NEXT: retq 2000; 2001; AVX2-LABEL: trunc_mul_v16i32_v16i8: 2002; AVX2: # %bb.0: 2003; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 2004; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 2005; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 2006; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2007; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2008; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2009; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2010; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2011; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2012; AVX2-NEXT: vzeroupper 2013; AVX2-NEXT: retq 2014; 2015; AVX512-LABEL: trunc_mul_v16i32_v16i8: 2016; AVX512: # %bb.0: 2017; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 2018; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2019; AVX512-NEXT: vzeroupper 2020; AVX512-NEXT: retq 2021 %1 = mul <16 x i32> %a0, %a1 2022 %2 = trunc <16 x i32> %1 to <16 x i8> 2023 ret <16 x i8> %2 2024} 2025 2026define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 2027; SSE-LABEL: trunc_mul_v16i16_v16i8: 2028; SSE: # %bb.0: 2029; SSE-NEXT: pmullw %xmm2, %xmm0 2030; SSE-NEXT: pmullw %xmm3, %xmm1 2031; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2032; SSE-NEXT: pand %xmm2, %xmm1 2033; SSE-NEXT: pand %xmm2, %xmm0 2034; SSE-NEXT: packuswb %xmm1, %xmm0 2035; SSE-NEXT: retq 2036; 2037; AVX1-LABEL: trunc_mul_v16i16_v16i8: 2038; AVX1: # %bb.0: 2039; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2040; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2041; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2042; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2043; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] 2044; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 2045; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 2046; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2047; AVX1-NEXT: vzeroupper 2048; AVX1-NEXT: retq 2049; 2050; AVX2-LABEL: trunc_mul_v16i16_v16i8: 2051; AVX2: # %bb.0: 2052; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2053; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2054; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2055; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2056; AVX2-NEXT: vzeroupper 2057; AVX2-NEXT: retq 2058; 2059; AVX512F-LABEL: 
trunc_mul_v16i16_v16i8: 2060; AVX512F: # %bb.0: 2061; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2062; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2063; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2064; AVX512F-NEXT: vzeroupper 2065; AVX512F-NEXT: retq 2066; 2067; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: 2068; AVX512BW: # %bb.0: 2069; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2070; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2071; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2072; AVX512BW-NEXT: vzeroupper 2073; AVX512BW-NEXT: retq 2074; 2075; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: 2076; AVX512DQ: # %bb.0: 2077; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 2078; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2079; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2080; AVX512DQ-NEXT: vzeroupper 2081; AVX512DQ-NEXT: retq 2082 %1 = mul <16 x i16> %a0, %a1 2083 %2 = trunc <16 x i16> %1 to <16 x i8> 2084 ret <16 x i8> %2 2085} 2086 2087define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { 2088; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2089; SSE: # %bb.0: 2090; SSE-NEXT: pxor %xmm3, %xmm3 2091; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 2092; SSE-NEXT: pslld $16, %xmm2 2093; SSE-NEXT: psrad $16, %xmm2 2094; SSE-NEXT: pslld $16, %xmm1 2095; SSE-NEXT: psrad $16, %xmm1 2096; SSE-NEXT: packssdw %xmm2, %xmm1 2097; SSE-NEXT: pmullw %xmm1, %xmm0 2098; SSE-NEXT: retq 2099; 2100; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2101; AVX1: # %bb.0: 2102; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 2103; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 2104; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 2105; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 2106; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2107; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2108; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2109; AVX1-NEXT: vzeroupper 2110; AVX1-NEXT: retq 2111; 2112; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2113; AVX2: # %bb.0: 2114; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2115; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2116; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] 2117; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2118; AVX2-NEXT: vzeroupper 2119; AVX2-NEXT: retq 2120; 2121; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: 2122; AVX512: # %bb.0: 2123; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 2124; AVX512-NEXT: vpmovdw %zmm1, %ymm1 2125; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2126; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 2127; AVX512-NEXT: vzeroupper 2128; AVX512-NEXT: retq 2129 %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, 
i32 6, i32 7> 2130 %2 = zext <8 x i8> %1 to <8 x i32> 2131 %3 = mul <8 x i32> %2, %a1 2132 %4 = trunc <8 x i32> %3 to <8 x i16> 2133 ret <8 x i16> %4 2134} 2135 2136; 2137; mul to constant 2138; 2139 2140define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 2141; SSE-LABEL: trunc_mul_const_v4i64_v4i32: 2142; SSE: # %bb.0: 2143; SSE-NEXT: xorps %xmm2, %xmm2 2144; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1] 2145; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2146; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2] 2147; SSE-NEXT: movaps %xmm2, %xmm0 2148; SSE-NEXT: retq 2149; 2150; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: 2151; AVX1: # %bb.0: 2152; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2153; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2154; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2155; AVX1-NEXT: vzeroupper 2156; AVX1-NEXT: retq 2157; 2158; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: 2159; AVX2-SLOW: # %bb.0: 2160; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2161; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2162; AVX2-SLOW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2163; AVX2-SLOW-NEXT: vzeroupper 2164; AVX2-SLOW-NEXT: retq 2165; 2166; AVX2-FAST-ALL-LABEL: trunc_mul_const_v4i64_v4i32: 2167; AVX2-FAST-ALL: # %bb.0: 2168; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 2169; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 2170; AVX2-FAST-ALL-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2171; AVX2-FAST-ALL-NEXT: vzeroupper 2172; AVX2-FAST-ALL-NEXT: retq 2173; 2174; AVX2-FAST-PERLANE-LABEL: trunc_mul_const_v4i64_v4i32: 2175; AVX2-FAST-PERLANE: # %bb.0: 2176; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2177; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2178; AVX2-FAST-PERLANE-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2179; AVX2-FAST-PERLANE-NEXT: vzeroupper 2180; AVX2-FAST-PERLANE-NEXT: retq 2181; 2182; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: 2183; AVX512: # %bb.0: 2184; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2185; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2186; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2187; AVX512-NEXT: vzeroupper 2188; AVX512-NEXT: retq 2189 %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2190 %2 = trunc <4 x i64> %1 to <4 x i32> 2191 ret <4 x i32> %2 2192} 2193 2194define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 2195; SSE-LABEL: trunc_mul_const_v8i64_v8i16: 2196; SSE: # %bb.0: 2197; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2198; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2199; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2200; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2201; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2202; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2203; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2204; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2205; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2206; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2207; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2208; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2209; SSE-NEXT: retq 2210; 2211; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: 2212; AVX1: # %bb.0: 2213; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 2214; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2215; AVX1-NEXT: vextractf128 $1, %ymm1, 
%xmm3 2216; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2217; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2218; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2219; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2220; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2221; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2222; AVX1-NEXT: vzeroupper 2223; AVX1-NEXT: retq 2224; 2225; AVX2-LABEL: trunc_mul_const_v8i64_v8i16: 2226; AVX2: # %bb.0: 2227; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2228; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2229; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2230; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2231; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2232; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2233; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2234; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2235; AVX2-NEXT: vzeroupper 2236; AVX2-NEXT: retq 2237; 2238; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: 2239; AVX512: # %bb.0: 2240; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2241; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2242; AVX512-NEXT: vzeroupper 2243; AVX512-NEXT: retq 2244 %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 2245 %2 = trunc <8 x i64> %1 to <8 x i16> 2246 ret <8 x i16> %2 2247} 2248 2249define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 2250; SSE-LABEL: trunc_mul_const_v8i32_v8i16: 2251; SSE: # %bb.0: 2252; SSE-NEXT: pslld $16, %xmm1 2253; SSE-NEXT: psrad $16, %xmm1 2254; SSE-NEXT: pslld $16, %xmm0 2255; SSE-NEXT: psrad $16, %xmm0 2256; SSE-NEXT: packssdw %xmm1, %xmm0 2257; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2258; SSE-NEXT: retq 2259; 2260; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: 2261; AVX1: # %bb.0: 2262; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2263; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 2264; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2265; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2266; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2267; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2268; AVX1-NEXT: vzeroupper 2269; AVX1-NEXT: retq 2270; 2271; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: 2272; AVX2: # %bb.0: 2273; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2274; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 2275; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2276; AVX2-NEXT: vzeroupper 2277; AVX2-NEXT: retq 2278; 2279; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: 2280; AVX512: # %bb.0: 2281; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2282; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2283; AVX512-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2284; AVX512-NEXT: vzeroupper 2285; AVX512-NEXT: retq 2286 %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2287 %2 = trunc <8 x i32> %1 to <8 x i16> 2288 ret <8 x i16> %2 2289} 2290 2291define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 2292; SSE-LABEL: trunc_mul_const_v16i64_v16i8: 2293; SSE: # %bb.0: 2294; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2295; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2296; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2297; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2298; 
SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5 2299; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6 2300; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 2301; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2302; SSE-NEXT: pand %xmm8, %xmm7 2303; SSE-NEXT: pand %xmm8, %xmm6 2304; SSE-NEXT: packuswb %xmm7, %xmm6 2305; SSE-NEXT: pand %xmm8, %xmm5 2306; SSE-NEXT: pand %xmm8, %xmm4 2307; SSE-NEXT: packuswb %xmm5, %xmm4 2308; SSE-NEXT: packuswb %xmm6, %xmm4 2309; SSE-NEXT: pand %xmm8, %xmm3 2310; SSE-NEXT: pand %xmm8, %xmm2 2311; SSE-NEXT: packuswb %xmm3, %xmm2 2312; SSE-NEXT: pand %xmm8, %xmm1 2313; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2314; SSE-NEXT: packuswb %xmm1, %xmm0 2315; SSE-NEXT: packuswb %xmm2, %xmm0 2316; SSE-NEXT: packuswb %xmm4, %xmm0 2317; SSE-NEXT: retq 2318; 2319; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: 2320; AVX1: # %bb.0: 2321; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm8 2322; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2323; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2324; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm5 2325; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2326; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2327; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm6 2328; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2329; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 2330; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm7 2331; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2332; AVX1-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 2333; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] 2334; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2335; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 2336; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 2337; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2338; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 2339; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 2340; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2341; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2342; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 2343; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2344; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2345; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 2346; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 2347; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2348; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2349; AVX1-NEXT: vzeroupper 2350; AVX1-NEXT: retq 2351; 2352; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: 2353; AVX2: # %bb.0: 2354; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2355; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2356; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 2357; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 2358; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 2359; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 2360; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 2361; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 2362; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 2363; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 2364; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 2365; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2366; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 2367; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 2368; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2369; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2370; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2371; AVX2-NEXT: vzeroupper 2372; AVX2-NEXT: retq 2373; 2374; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8: 2375; 
AVX512F: # %bb.0: 2376; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2377; AVX512F-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2378; AVX512F-NEXT: vpmovqb %zmm1, %xmm1 2379; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 2380; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2381; AVX512F-NEXT: vzeroupper 2382; AVX512F-NEXT: retq 2383; 2384; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8: 2385; AVX512BW: # %bb.0: 2386; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2387; AVX512BW-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2388; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1 2389; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 2390; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2391; AVX512BW-NEXT: vzeroupper 2392; AVX512BW-NEXT: retq 2393; 2394; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8: 2395; AVX512DQ: # %bb.0: 2396; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2397; AVX512DQ-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 2398; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1 2399; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0 2400; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2401; AVX512DQ-NEXT: vzeroupper 2402; AVX512DQ-NEXT: retq 2403 %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 2404 %2 = trunc <16 x i64> %1 to <16 x i8> 2405 ret <16 x i8> %2 2406} 2407 2408define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 2409; SSE-LABEL: trunc_mul_const_v16i32_v16i8: 2410; SSE: # %bb.0: 2411; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 2412; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2413; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2414; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2415; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2416; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2417; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 2418; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2419; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2420; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2421; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2422; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2423; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 2424; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 2425; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 2426; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2427; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2428; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 2429; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 2430; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 2431; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 2432; SSE-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 2433; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2434; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 2435; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2436; SSE-NEXT: pand %xmm4, %xmm3 2437; SSE-NEXT: pand %xmm4, %xmm2 2438; SSE-NEXT: packuswb %xmm3, %xmm2 2439; SSE-NEXT: pand %xmm4, %xmm1 2440; SSE-NEXT: pand %xmm4, %xmm0 2441; SSE-NEXT: packuswb %xmm1, %xmm0 2442; SSE-NEXT: packuswb %xmm2, %xmm0 2443; SSE-NEXT: retq 2444; 2445; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: 2446; AVX1: # %bb.0: 2447; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 2448; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2449; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2450; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 2451; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2452; AVX1-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 2453; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255] 2454; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 2455; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 2456; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 2457; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 2458; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 2459; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 2460; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2461; AVX1-NEXT: vzeroupper 2462; AVX1-NEXT: retq 2463; 2464; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: 2465; AVX2: # %bb.0: 2466; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2467; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 2468; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 2469; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 2470; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2471; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2472; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2473; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2474; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2475; AVX2-NEXT: vzeroupper 2476; AVX2-NEXT: retq 2477; 2478; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: 2479; AVX512: # %bb.0: 2480; AVX512-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 2481; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2482; AVX512-NEXT: vzeroupper 2483; AVX512-NEXT: retq 2484 %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2485 %2 = trunc <16 x i32> %1 to <16 x i8> 2486 ret <16 x i8> %2 2487} 2488 2489define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 2490; SSE-LABEL: trunc_mul_const_v16i16_v16i8: 2491; SSE: # %bb.0: 2492; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2493; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 2494; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2495; SSE-NEXT: pand %xmm2, %xmm1 2496; SSE-NEXT: pand %xmm2, %xmm0 2497; SSE-NEXT: packuswb %xmm1, %xmm0 2498; SSE-NEXT: retq 2499; 2500; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: 2501; AVX1: # %bb.0: 2502; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 2503; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2504; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2505; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] 2506; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 2507; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 2508; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 2509; AVX1-NEXT: vzeroupper 2510; AVX1-NEXT: retq 2511; 2512; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: 2513; AVX2: # %bb.0: 2514; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2515; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2516; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2517; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2518; AVX2-NEXT: vzeroupper 2519; AVX2-NEXT: retq 2520; 2521; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: 2522; AVX512F: # %bb.0: 2523; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2524; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2525; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2526; AVX512F-NEXT: vzeroupper 2527; AVX512F-NEXT: retq 2528; 2529; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: 2530; AVX512BW: # %bb.0: 2531; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2532; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2533; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2534; AVX512BW-NEXT: vzeroupper 2535; AVX512BW-NEXT: retq 2536; 2537; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: 2538; AVX512DQ: # %bb.0: 2539; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2540; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2541; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2542; AVX512DQ-NEXT: vzeroupper 2543; AVX512DQ-NEXT: retq 2544 %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 2545 %2 = trunc <16 x i16> %1 to <16 x i8> 2546 ret <16 x i8> %2 2547} 2548 2549; 2550; and 2551; 2552 2553define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 2554; SSE-LABEL: trunc_and_v4i64_v4i32: 2555; SSE: # %bb.0: 2556; SSE-NEXT: andps %xmm3, %xmm1 2557; SSE-NEXT: andps %xmm2, %xmm0 2558; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2559; SSE-NEXT: retq 2560; 2561; AVX1-LABEL: trunc_and_v4i64_v4i32: 2562; AVX1: # %bb.0: 2563; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2564; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2565; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2566; AVX1-NEXT: vzeroupper 2567; AVX1-NEXT: retq 2568; 2569; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: 2570; AVX2-SLOW: # %bb.0: 2571; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 2572; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2573; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2574; AVX2-SLOW-NEXT: vzeroupper 2575; AVX2-SLOW-NEXT: retq 2576; 2577; AVX2-FAST-ALL-LABEL: trunc_and_v4i64_v4i32: 2578; AVX2-FAST-ALL: # %bb.0: 2579; AVX2-FAST-ALL-NEXT: vandps %ymm1, %ymm0, %ymm0 2580; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 2581; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 2582; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2583; AVX2-FAST-ALL-NEXT: vzeroupper 2584; AVX2-FAST-ALL-NEXT: retq 2585; 2586; AVX2-FAST-PERLANE-LABEL: trunc_and_v4i64_v4i32: 2587; AVX2-FAST-PERLANE: # %bb.0: 2588; AVX2-FAST-PERLANE-NEXT: vandps %ymm1, %ymm0, %ymm0 2589; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2590; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2591; AVX2-FAST-PERLANE-NEXT: vzeroupper 2592; AVX2-FAST-PERLANE-NEXT: retq 2593; 2594; AVX512-LABEL: trunc_and_v4i64_v4i32: 2595; AVX512: # %bb.0: 2596; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2597; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2598; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2599; AVX512-NEXT: vzeroupper 2600; AVX512-NEXT: retq 2601 %1 = and <4 x i64> %a0, %a1 2602 %2 = trunc <4 x i64> %1 to <4 x i32> 2603 ret <4 x i32> %2 2604} 2605 2606define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 2607; SSE-LABEL: trunc_and_v8i64_v8i16: 2608; SSE: # 
%bb.0: 2609; SSE-NEXT: pand %xmm6, %xmm2 2610; SSE-NEXT: pand %xmm7, %xmm3 2611; SSE-NEXT: pand %xmm4, %xmm0 2612; SSE-NEXT: pand %xmm5, %xmm1 2613; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2614; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2615; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2616; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2617; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2618; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2619; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2620; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2621; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2622; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2623; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2624; SSE-NEXT: retq 2625; 2626; AVX1-LABEL: trunc_and_v8i64_v8i16: 2627; AVX1: # %bb.0: 2628; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] 2629; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 2630; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2631; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2632; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2633; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 2634; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2635; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2636; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2637; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2638; AVX1-NEXT: vzeroupper 2639; AVX1-NEXT: retq 2640; 2641; AVX2-LABEL: trunc_and_v8i64_v8i16: 2642; AVX2: # %bb.0: 2643; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2644; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2645; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 2646; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 2647; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 2648; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2649; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2650; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2651; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2652; AVX2-NEXT: vzeroupper 2653; AVX2-NEXT: retq 2654; 2655; AVX512-LABEL: trunc_and_v8i64_v8i16: 2656; AVX512: # %bb.0: 2657; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 2658; AVX512-NEXT: vpmovqw %zmm0, %xmm0 2659; AVX512-NEXT: vzeroupper 2660; AVX512-NEXT: retq 2661 %1 = and <8 x i64> %a0, %a1 2662 %2 = trunc <8 x i64> %1 to <8 x i16> 2663 ret <8 x i16> %2 2664} 2665 2666define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 2667; SSE-LABEL: trunc_and_v8i32_v8i16: 2668; SSE: # %bb.0: 2669; SSE-NEXT: pand %xmm2, %xmm0 2670; SSE-NEXT: pand %xmm3, %xmm1 2671; SSE-NEXT: pslld $16, %xmm1 2672; SSE-NEXT: psrad $16, %xmm1 2673; SSE-NEXT: pslld $16, %xmm0 2674; SSE-NEXT: psrad $16, %xmm0 2675; SSE-NEXT: packssdw %xmm1, %xmm0 2676; SSE-NEXT: retq 2677; 2678; AVX1-LABEL: trunc_and_v8i32_v8i16: 2679; AVX1: # %bb.0: 2680; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2681; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2682; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 2683; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 2684; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 2685; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2686; AVX1-NEXT: vzeroupper 2687; AVX1-NEXT: retq 2688; 2689; AVX2-LABEL: trunc_and_v8i32_v8i16: 2690; AVX2: # %bb.0: 2691; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2692; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 2693; AVX2-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,2,2,3] 2694; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2695; AVX2-NEXT: vzeroupper 2696; AVX2-NEXT: retq 2697; 2698; AVX512-LABEL: trunc_and_v8i32_v8i16: 2699; AVX512: # %bb.0: 2700; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 2701; AVX512-NEXT: vpmovdw %zmm0, %ymm0 2702; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2703; AVX512-NEXT: vzeroupper 2704; AVX512-NEXT: retq 2705 %1 = and <8 x i32> %a0, %a1 2706 %2 = trunc <8 x i32> %1 to <8 x i16> 2707 ret <8 x i16> %2 2708} 2709 2710define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 2711; SSE-LABEL: trunc_and_v16i64_v16i8: 2712; SSE: # %bb.0: 2713; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0 2714; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1 2715; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2 2716; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3 2717; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4 2718; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5 2719; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6 2720; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7 2721; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 2722; SSE-NEXT: pand %xmm8, %xmm7 2723; SSE-NEXT: pand %xmm8, %xmm6 2724; SSE-NEXT: packuswb %xmm7, %xmm6 2725; SSE-NEXT: pand %xmm8, %xmm5 2726; SSE-NEXT: pand %xmm8, %xmm4 2727; SSE-NEXT: packuswb %xmm5, %xmm4 2728; SSE-NEXT: packuswb %xmm6, %xmm4 2729; SSE-NEXT: pand %xmm8, %xmm3 2730; SSE-NEXT: pand %xmm8, %xmm2 2731; SSE-NEXT: packuswb %xmm3, %xmm2 2732; SSE-NEXT: pand %xmm8, %xmm1 2733; SSE-NEXT: pand %xmm8, %xmm0 2734; SSE-NEXT: packuswb %xmm1, %xmm0 2735; SSE-NEXT: packuswb %xmm2, %xmm0 2736; SSE-NEXT: packuswb %xmm4, %xmm0 2737; SSE-NEXT: retq 2738; 2739; AVX1-LABEL: trunc_and_v16i64_v16i8: 2740; AVX1: # %bb.0: 2741; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255] 2742; AVX1-NEXT: vandps %ymm7, %ymm8, %ymm7 2743; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 2744; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 2745; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3 2746; AVX1-NEXT: vandps %ymm6, %ymm8, %ymm6 2747; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 2748; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 2749; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 2750; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 2751; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm3 2752; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2753; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2754; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2755; AVX1-NEXT: vandps %ymm4, %ymm8, %ymm3 2756; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 2757; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 2758; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 2759; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2760; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 2761; AVX1-NEXT: vzeroupper 2762; AVX1-NEXT: retq 2763; 2764; AVX2-LABEL: trunc_and_v16i64_v16i8: 2765; AVX2: # %bb.0: 2766; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [255,255,255,255] 2767; AVX2-NEXT: vpand %ymm7, %ymm8, %ymm7 2768; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3 2769; AVX2-NEXT: vpand %ymm6, %ymm8, %ymm6 2770; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2 2771; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 2772; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 2773; AVX2-NEXT: vpand %ymm5, %ymm8, %ymm3 2774; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2775; AVX2-NEXT: vpand %ymm4, %ymm8, %ymm3 2776; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 2777; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2778; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 2779; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 2780; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2781; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2782; AVX2-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm0[0,2,1,3] 2783; AVX2-NEXT: vzeroupper 2784; AVX2-NEXT: retq 2785; 2786; AVX512-LABEL: trunc_and_v16i64_v16i8: 2787; AVX512: # %bb.0: 2788; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 2789; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 2790; AVX512-NEXT: vpmovqb %zmm1, %xmm1 2791; AVX512-NEXT: vpmovqb %zmm0, %xmm0 2792; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 2793; AVX512-NEXT: vzeroupper 2794; AVX512-NEXT: retq 2795 %1 = and <16 x i64> %a0, %a1 2796 %2 = trunc <16 x i64> %1 to <16 x i8> 2797 ret <16 x i8> %2 2798} 2799 2800define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 2801; SSE-LABEL: trunc_and_v16i32_v16i8: 2802; SSE: # %bb.0: 2803; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 2804; SSE-NEXT: pand %xmm8, %xmm7 2805; SSE-NEXT: pand %xmm3, %xmm7 2806; SSE-NEXT: pand %xmm8, %xmm6 2807; SSE-NEXT: pand %xmm2, %xmm6 2808; SSE-NEXT: packuswb %xmm7, %xmm6 2809; SSE-NEXT: pand %xmm8, %xmm5 2810; SSE-NEXT: pand %xmm1, %xmm5 2811; SSE-NEXT: pand %xmm4, %xmm8 2812; SSE-NEXT: pand %xmm8, %xmm0 2813; SSE-NEXT: packuswb %xmm5, %xmm0 2814; SSE-NEXT: packuswb %xmm6, %xmm0 2815; SSE-NEXT: retq 2816; 2817; AVX1-LABEL: trunc_and_v16i32_v16i8: 2818; AVX1: # %bb.0: 2819; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] 2820; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 2821; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 2822; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2823; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2824; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 2825; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2826; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2827; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2828; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2829; AVX1-NEXT: vzeroupper 2830; AVX1-NEXT: retq 2831; 2832; AVX2-LABEL: trunc_and_v16i32_v16i8: 2833; AVX2: # %bb.0: 2834; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] 2835; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 2836; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 2837; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 2838; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 2839; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 2840; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2841; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2842; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 2843; AVX2-NEXT: vzeroupper 2844; AVX2-NEXT: retq 2845; 2846; AVX512-LABEL: trunc_and_v16i32_v16i8: 2847; AVX512: # %bb.0: 2848; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 2849; AVX512-NEXT: vpmovdb %zmm0, %xmm0 2850; AVX512-NEXT: vzeroupper 2851; AVX512-NEXT: retq 2852 %1 = and <16 x i32> %a0, %a1 2853 %2 = trunc <16 x i32> %1 to <16 x i8> 2854 ret <16 x i8> %2 2855} 2856 2857define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 2858; SSE-LABEL: trunc_and_v16i16_v16i8: 2859; SSE: # %bb.0: 2860; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 2861; SSE-NEXT: pand %xmm4, %xmm3 2862; SSE-NEXT: pand %xmm1, %xmm3 2863; SSE-NEXT: pand %xmm2, %xmm4 2864; SSE-NEXT: pand %xmm4, %xmm0 2865; SSE-NEXT: packuswb %xmm3, %xmm0 2866; SSE-NEXT: retq 2867; 2868; AVX1-LABEL: trunc_and_v16i16_v16i8: 2869; AVX1: # %bb.0: 2870; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 2871; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2872; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2873; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2874; AVX1-NEXT: vzeroupper 2875; AVX1-NEXT: retq 2876; 2877; AVX2-LABEL: trunc_and_v16i16_v16i8: 2878; AVX2: # %bb.0: 2879; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 2880; 
AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 2881; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2882; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 2883; AVX2-NEXT: vzeroupper 2884; AVX2-NEXT: retq 2885; 2886; AVX512F-LABEL: trunc_and_v16i16_v16i8: 2887; AVX512F: # %bb.0: 2888; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 2889; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2890; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 2891; AVX512F-NEXT: vzeroupper 2892; AVX512F-NEXT: retq 2893; 2894; AVX512BW-LABEL: trunc_and_v16i16_v16i8: 2895; AVX512BW: # %bb.0: 2896; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 2897; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 2898; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 2899; AVX512BW-NEXT: vzeroupper 2900; AVX512BW-NEXT: retq 2901; 2902; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: 2903; AVX512DQ: # %bb.0: 2904; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 2905; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2906; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 2907; AVX512DQ-NEXT: vzeroupper 2908; AVX512DQ-NEXT: retq 2909 %1 = and <16 x i16> %a0, %a1 2910 %2 = trunc <16 x i16> %1 to <16 x i8> 2911 ret <16 x i8> %2 2912} 2913 2914; 2915; and to constant 2916; 2917 2918define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 2919; SSE-LABEL: trunc_and_const_v4i64_v4i32: 2920; SSE: # %bb.0: 2921; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2922; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2923; SSE-NEXT: retq 2924; 2925; AVX1-LABEL: trunc_and_const_v4i64_v4i32: 2926; AVX1: # %bb.0: 2927; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 2928; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2929; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2930; AVX1-NEXT: vzeroupper 2931; AVX1-NEXT: retq 2932; 2933; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: 2934; AVX2-SLOW: # %bb.0: 2935; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 2936; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2937; AVX2-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2938; AVX2-SLOW-NEXT: vzeroupper 2939; AVX2-SLOW-NEXT: retq 2940; 2941; AVX2-FAST-ALL-LABEL: trunc_and_const_v4i64_v4i32: 2942; AVX2-FAST-ALL: # %bb.0: 2943; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <u,2,4,6,u,u,u,u> 2944; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 2945; AVX2-FAST-ALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2946; AVX2-FAST-ALL-NEXT: vzeroupper 2947; AVX2-FAST-ALL-NEXT: retq 2948; 2949; AVX2-FAST-PERLANE-LABEL: trunc_and_const_v4i64_v4i32: 2950; AVX2-FAST-PERLANE: # %bb.0: 2951; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 2952; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2953; AVX2-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2954; AVX2-FAST-PERLANE-NEXT: vzeroupper 2955; AVX2-FAST-PERLANE-NEXT: retq 2956; 2957; AVX512-LABEL: trunc_and_const_v4i64_v4i32: 2958; AVX512: # %bb.0: 2959; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 2960; AVX512-NEXT: vpmovqd %zmm0, %ymm0 2961; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2962; AVX512-NEXT: 
vzeroupper 2963; AVX512-NEXT: retq 2964 %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 2965 %2 = trunc <4 x i64> %1 to <4 x i32> 2966 ret <4 x i32> %2 2967} 2968 2969define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 2970; SSE-LABEL: trunc_and_const_v8i64_v8i16: 2971; SSE: # %bb.0: 2972; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 2973; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 2974; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2975; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 2976; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 2977; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 2978; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 2979; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2980; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 2981; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2982; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 2983; SSE-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 2984; SSE-NEXT: retq 2985; 2986; AVX1-LABEL: trunc_and_const_v8i64_v8i16: 2987; AVX1: # %bb.0: 2988; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 2989; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 2990; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 2991; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 2992; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 2993; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 2994; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 2995; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 2996; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 2997; AVX1-NEXT: vzeroupper 2998; AVX1-NEXT: retq 2999; 3000; AVX2-LABEL: trunc_and_const_v8i64_v8i16: 3001; AVX2: # %bb.0: 3002; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3003; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 3004; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 3005; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3006; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3007; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3008; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3009; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3010; AVX2-NEXT: vzeroupper 3011; AVX2-NEXT: retq 3012; 3013; AVX512-LABEL: trunc_and_const_v8i64_v8i16: 3014; AVX512: # %bb.0: 3015; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3016; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3017; AVX512-NEXT: vzeroupper 3018; AVX512-NEXT: retq 3019 %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 3020 %2 = trunc <8 x i64> %1 to <8 x i16> 3021 ret <8 x i16> %2 3022} 3023 3024define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 3025; SSE-LABEL: trunc_and_const_v8i32_v8i16: 3026; SSE: # %bb.0: 3027; SSE-NEXT: pslld $16, %xmm1 3028; SSE-NEXT: psrad $16, %xmm1 3029; SSE-NEXT: pslld $16, %xmm0 3030; SSE-NEXT: psrad $16, %xmm0 3031; SSE-NEXT: packssdw %xmm1, %xmm0 3032; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3033; SSE-NEXT: retq 3034; 3035; AVX1-LABEL: trunc_and_const_v8i32_v8i16: 3036; AVX1: # %bb.0: 3037; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3038; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 3039; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3040; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3041; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3042; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3043; AVX1-NEXT: 
vzeroupper 3044; AVX1-NEXT: retq 3045; 3046; AVX2-LABEL: trunc_and_const_v8i32_v8i16: 3047; AVX2: # %bb.0: 3048; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3049; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3050; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3051; AVX2-NEXT: vzeroupper 3052; AVX2-NEXT: retq 3053; 3054; AVX512-LABEL: trunc_and_const_v8i32_v8i16: 3055; AVX512: # %bb.0: 3056; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3057; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3058; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3059; AVX512-NEXT: vzeroupper 3060; AVX512-NEXT: retq 3061 %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3062 %2 = trunc <8 x i32> %1 to <8 x i16> 3063 ret <8 x i16> %2 3064} 3065 3066define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 3067; SSE-LABEL: trunc_and_const_v16i64_v16i8: 3068; SSE: # %bb.0: 3069; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3070; SSE-NEXT: pand %xmm8, %xmm7 3071; SSE-NEXT: pand %xmm8, %xmm6 3072; SSE-NEXT: packuswb %xmm7, %xmm6 3073; SSE-NEXT: pand %xmm8, %xmm5 3074; SSE-NEXT: pand %xmm8, %xmm4 3075; SSE-NEXT: packuswb %xmm5, %xmm4 3076; SSE-NEXT: packuswb %xmm6, %xmm4 3077; SSE-NEXT: pand %xmm8, %xmm3 3078; SSE-NEXT: pand %xmm8, %xmm2 3079; SSE-NEXT: packuswb %xmm3, %xmm2 3080; SSE-NEXT: pand %xmm8, %xmm1 3081; SSE-NEXT: pand %xmm8, %xmm0 3082; SSE-NEXT: packuswb %xmm1, %xmm0 3083; SSE-NEXT: packuswb %xmm2, %xmm0 3084; SSE-NEXT: packuswb %xmm4, %xmm0 3085; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3086; SSE-NEXT: retq 3087; 3088; AVX1-LABEL: trunc_and_const_v16i64_v16i8: 3089; AVX1: # %bb.0: 3090; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 3091; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3092; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3093; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3094; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3095; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3096; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3097; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3098; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3099; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3100; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3101; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3102; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3103; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3104; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3105; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3106; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3107; AVX1-NEXT: vzeroupper 3108; AVX1-NEXT: retq 3109; 3110; AVX2-LABEL: trunc_and_const_v16i64_v16i8: 3111; AVX2: # %bb.0: 3112; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 3113; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 3114; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 3115; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 3116; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 3117; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 3118; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3119; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3120; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3121; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 3122; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3123; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3124; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3125; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3126; AVX2-NEXT: vzeroupper 3127; AVX2-NEXT: retq 3128; 3129; AVX512-LABEL: trunc_and_const_v16i64_v16i8: 3130; AVX512: # %bb.0: 
3131; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3132; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3133; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3134; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3135; AVX512-NEXT: vzeroupper 3136; AVX512-NEXT: retq 3137 %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 3138 %2 = trunc <16 x i64> %1 to <16 x i8> 3139 ret <16 x i8> %2 3140} 3141 3142define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 3143; SSE-LABEL: trunc_and_const_v16i32_v16i8: 3144; SSE: # %bb.0: 3145; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3146; SSE-NEXT: pand %xmm4, %xmm3 3147; SSE-NEXT: pand %xmm4, %xmm2 3148; SSE-NEXT: packuswb %xmm3, %xmm2 3149; SSE-NEXT: pand %xmm4, %xmm1 3150; SSE-NEXT: pand %xmm4, %xmm0 3151; SSE-NEXT: packuswb %xmm1, %xmm0 3152; SSE-NEXT: packuswb %xmm2, %xmm0 3153; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3154; SSE-NEXT: retq 3155; 3156; AVX1-LABEL: trunc_and_const_v16i32_v16i8: 3157; AVX1: # %bb.0: 3158; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3159; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3160; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3161; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3162; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3163; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3164; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3165; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3166; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3167; AVX1-NEXT: vzeroupper 3168; AVX1-NEXT: retq 3169; 3170; AVX2-LABEL: trunc_and_const_v16i32_v16i8: 3171; AVX2: # %bb.0: 3172; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3173; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 3174; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3175; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3176; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3177; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3178; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3179; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3180; AVX2-NEXT: vzeroupper 3181; AVX2-NEXT: retq 3182; 3183; AVX512-LABEL: trunc_and_const_v16i32_v16i8: 3184; AVX512: # %bb.0: 3185; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3186; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3187; AVX512-NEXT: vzeroupper 3188; AVX512-NEXT: retq 3189 %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3190 %2 = trunc <16 x i32> %1 to <16 x i8> 3191 ret <16 x i8> %2 3192} 3193 3194define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3195; SSE-LABEL: trunc_and_const_v16i16_v16i8: 3196; SSE: # %bb.0: 3197; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3198; SSE-NEXT: pand %xmm2, %xmm1 3199; SSE-NEXT: pand %xmm2, %xmm0 3200; SSE-NEXT: packuswb %xmm1, %xmm0 3201; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3202; SSE-NEXT: retq 3203; 3204; AVX1-LABEL: trunc_and_const_v16i16_v16i8: 3205; AVX1: # %bb.0: 3206; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3207; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3208; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3209; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3210; AVX1-NEXT: vzeroupper 3211; AVX1-NEXT: retq 3212; 3213; AVX2-LABEL: trunc_and_const_v16i16_v16i8: 3214; AVX2: # %bb.0: 3215; AVX2-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3216; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3217; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3218; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3219; AVX2-NEXT: vzeroupper 3220; AVX2-NEXT: retq 3221; 3222; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: 3223; AVX512F: # %bb.0: 3224; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3225; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3226; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3227; AVX512F-NEXT: vzeroupper 3228; AVX512F-NEXT: retq 3229; 3230; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: 3231; AVX512BW: # %bb.0: 3232; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3233; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3234; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3235; AVX512BW-NEXT: vzeroupper 3236; AVX512BW-NEXT: retq 3237; 3238; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: 3239; AVX512DQ: # %bb.0: 3240; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3241; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3242; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3243; AVX512DQ-NEXT: vzeroupper 3244; AVX512DQ-NEXT: retq 3245 %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3246 %2 = trunc <16 x i16> %1 to <16 x i8> 3247 ret <16 x i8> %2 3248} 3249 3250; 3251; xor 3252; 3253 3254define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3255; SSE-LABEL: trunc_xor_v4i64_v4i32: 3256; SSE: # %bb.0: 3257; SSE-NEXT: xorps %xmm3, %xmm1 3258; SSE-NEXT: xorps %xmm2, %xmm0 3259; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3260; SSE-NEXT: retq 3261; 3262; AVX1-LABEL: trunc_xor_v4i64_v4i32: 3263; AVX1: # %bb.0: 3264; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3265; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3266; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3267; AVX1-NEXT: vzeroupper 3268; AVX1-NEXT: retq 3269; 3270; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: 3271; AVX2-SLOW: # %bb.0: 3272; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 3273; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3274; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3275; AVX2-SLOW-NEXT: vzeroupper 3276; AVX2-SLOW-NEXT: retq 3277; 3278; AVX2-FAST-ALL-LABEL: trunc_xor_v4i64_v4i32: 3279; AVX2-FAST-ALL: # %bb.0: 3280; AVX2-FAST-ALL-NEXT: vxorps %ymm1, %ymm0, %ymm0 3281; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 3282; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 3283; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3284; AVX2-FAST-ALL-NEXT: vzeroupper 3285; AVX2-FAST-ALL-NEXT: retq 3286; 3287; AVX2-FAST-PERLANE-LABEL: trunc_xor_v4i64_v4i32: 3288; AVX2-FAST-PERLANE: # %bb.0: 3289; AVX2-FAST-PERLANE-NEXT: vxorps %ymm1, %ymm0, %ymm0 3290; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 3291; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3292; AVX2-FAST-PERLANE-NEXT: vzeroupper 3293; AVX2-FAST-PERLANE-NEXT: retq 3294; 3295; AVX512-LABEL: trunc_xor_v4i64_v4i32: 3296; AVX512: # %bb.0: 
3297; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 3298; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3299; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3300; AVX512-NEXT: vzeroupper 3301; AVX512-NEXT: retq 3302 %1 = xor <4 x i64> %a0, %a1 3303 %2 = trunc <4 x i64> %1 to <4 x i32> 3304 ret <4 x i32> %2 3305} 3306 3307define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 3308; SSE-LABEL: trunc_xor_v8i64_v8i16: 3309; SSE: # %bb.0: 3310; SSE-NEXT: pxor %xmm6, %xmm2 3311; SSE-NEXT: pxor %xmm7, %xmm3 3312; SSE-NEXT: pxor %xmm4, %xmm0 3313; SSE-NEXT: pxor %xmm5, %xmm1 3314; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3315; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3316; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3317; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3318; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3319; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3320; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3321; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3322; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3323; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3324; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3325; SSE-NEXT: retq 3326; 3327; AVX1-LABEL: trunc_xor_v8i64_v8i16: 3328; AVX1: # %bb.0: 3329; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3330; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3331; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 3332; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3333; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3334; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3335; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3336; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3337; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3338; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3339; AVX1-NEXT: vzeroupper 3340; AVX1-NEXT: retq 3341; 3342; AVX2-LABEL: trunc_xor_v8i64_v8i16: 3343; AVX2: # %bb.0: 3344; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 3345; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 3346; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3347; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 3348; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 3349; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3350; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3351; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3352; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3353; AVX2-NEXT: vzeroupper 3354; AVX2-NEXT: retq 3355; 3356; AVX512-LABEL: trunc_xor_v8i64_v8i16: 3357; AVX512: # %bb.0: 3358; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 3359; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3360; AVX512-NEXT: vzeroupper 3361; AVX512-NEXT: retq 3362 %1 = xor <8 x i64> %a0, %a1 3363 %2 = trunc <8 x i64> %1 to <8 x i16> 3364 ret <8 x i16> %2 3365} 3366 3367define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 3368; SSE-LABEL: trunc_xor_v8i32_v8i16: 3369; SSE: # %bb.0: 3370; SSE-NEXT: pxor %xmm2, %xmm0 3371; SSE-NEXT: pxor %xmm3, %xmm1 3372; SSE-NEXT: pslld $16, %xmm1 3373; SSE-NEXT: psrad $16, %xmm1 3374; SSE-NEXT: pslld $16, %xmm0 3375; SSE-NEXT: psrad $16, %xmm0 3376; SSE-NEXT: packssdw %xmm1, %xmm0 3377; SSE-NEXT: retq 3378; 3379; AVX1-LABEL: trunc_xor_v8i32_v8i16: 3380; AVX1: # %bb.0: 3381; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3382; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3383; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 3384; AVX1-NEXT: vpshufb 
%xmm2, %xmm1, %xmm1 3385; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3386; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3387; AVX1-NEXT: vzeroupper 3388; AVX1-NEXT: retq 3389; 3390; AVX2-LABEL: trunc_xor_v8i32_v8i16: 3391; AVX2: # %bb.0: 3392; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3393; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3394; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3395; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3396; AVX2-NEXT: vzeroupper 3397; AVX2-NEXT: retq 3398; 3399; AVX512-LABEL: trunc_xor_v8i32_v8i16: 3400; AVX512: # %bb.0: 3401; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 3402; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3403; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3404; AVX512-NEXT: vzeroupper 3405; AVX512-NEXT: retq 3406 %1 = xor <8 x i32> %a0, %a1 3407 %2 = trunc <8 x i32> %1 to <8 x i16> 3408 ret <8 x i16> %2 3409} 3410 3411define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 3412; SSE-LABEL: trunc_xor_v16i64_v16i8: 3413; SSE: # %bb.0: 3414; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0 3415; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1 3416; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2 3417; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3 3418; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4 3419; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5 3420; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6 3421; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7 3422; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3423; SSE-NEXT: pand %xmm8, %xmm7 3424; SSE-NEXT: pand %xmm8, %xmm6 3425; SSE-NEXT: packuswb %xmm7, %xmm6 3426; SSE-NEXT: pand %xmm8, %xmm5 3427; SSE-NEXT: pand %xmm8, %xmm4 3428; SSE-NEXT: packuswb %xmm5, %xmm4 3429; SSE-NEXT: packuswb %xmm6, %xmm4 3430; SSE-NEXT: pand %xmm8, %xmm3 3431; SSE-NEXT: pand %xmm8, %xmm2 3432; SSE-NEXT: packuswb %xmm3, %xmm2 3433; SSE-NEXT: pand %xmm8, %xmm1 3434; SSE-NEXT: pand %xmm8, %xmm0 3435; SSE-NEXT: packuswb %xmm1, %xmm0 3436; SSE-NEXT: packuswb %xmm2, %xmm0 3437; SSE-NEXT: packuswb %xmm4, %xmm0 3438; SSE-NEXT: retq 3439; 3440; AVX1-LABEL: trunc_xor_v16i64_v16i8: 3441; AVX1: # %bb.0: 3442; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 3443; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 3444; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 3445; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 3446; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 3447; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3448; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3449; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3450; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3451; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3452; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3453; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3454; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3455; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3456; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3457; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3458; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3459; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3460; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3461; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3462; AVX1-NEXT: vzeroupper 3463; AVX1-NEXT: retq 3464; 3465; AVX2-LABEL: trunc_xor_v16i64_v16i8: 3466; AVX2: # %bb.0: 3467; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 3468; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1 3469; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 3470; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3 3471; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 3472; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 3473; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 3474; AVX2-NEXT: 
vpackusdw %ymm3, %ymm2, %ymm2 3475; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 3476; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 3477; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3478; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3479; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3480; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 3481; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3482; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3483; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3484; AVX2-NEXT: vzeroupper 3485; AVX2-NEXT: retq 3486; 3487; AVX512-LABEL: trunc_xor_v16i64_v16i8: 3488; AVX512: # %bb.0: 3489; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 3490; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 3491; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3492; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3493; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3494; AVX512-NEXT: vzeroupper 3495; AVX512-NEXT: retq 3496 %1 = xor <16 x i64> %a0, %a1 3497 %2 = trunc <16 x i64> %1 to <16 x i8> 3498 ret <16 x i8> %2 3499} 3500 3501define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { 3502; SSE-LABEL: trunc_xor_v16i32_v16i8: 3503; SSE: # %bb.0: 3504; SSE-NEXT: pxor %xmm4, %xmm0 3505; SSE-NEXT: pxor %xmm5, %xmm1 3506; SSE-NEXT: pxor %xmm6, %xmm2 3507; SSE-NEXT: pxor %xmm7, %xmm3 3508; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3509; SSE-NEXT: pand %xmm4, %xmm3 3510; SSE-NEXT: pand %xmm4, %xmm2 3511; SSE-NEXT: packuswb %xmm3, %xmm2 3512; SSE-NEXT: pand %xmm4, %xmm1 3513; SSE-NEXT: pand %xmm4, %xmm0 3514; SSE-NEXT: packuswb %xmm1, %xmm0 3515; SSE-NEXT: packuswb %xmm2, %xmm0 3516; SSE-NEXT: retq 3517; 3518; AVX1-LABEL: trunc_xor_v16i32_v16i8: 3519; AVX1: # %bb.0: 3520; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 3521; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 3522; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3523; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3524; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3525; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3526; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3527; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3528; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3529; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3530; AVX1-NEXT: vzeroupper 3531; AVX1-NEXT: retq 3532; 3533; AVX2-LABEL: trunc_xor_v16i32_v16i8: 3534; AVX2: # %bb.0: 3535; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 3536; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 3537; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3538; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 3539; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3540; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3541; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3542; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3543; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3544; AVX2-NEXT: vzeroupper 3545; AVX2-NEXT: retq 3546; 3547; AVX512-LABEL: trunc_xor_v16i32_v16i8: 3548; AVX512: # %bb.0: 3549; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 3550; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3551; AVX512-NEXT: vzeroupper 3552; AVX512-NEXT: retq 3553 %1 = xor <16 x i32> %a0, %a1 3554 %2 = trunc <16 x i32> %1 to <16 x i8> 3555 ret <16 x i8> %2 3556} 3557 3558define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { 3559; SSE-LABEL: trunc_xor_v16i16_v16i8: 3560; SSE: # %bb.0: 3561; SSE-NEXT: pxor %xmm2, %xmm0 3562; SSE-NEXT: pxor %xmm3, %xmm1 3563; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3564; SSE-NEXT: pand %xmm2, %xmm1 3565; SSE-NEXT: pand %xmm2, %xmm0 3566; SSE-NEXT: packuswb %xmm1, %xmm0 3567; SSE-NEXT: retq 
3568; 3569; AVX1-LABEL: trunc_xor_v16i16_v16i8: 3570; AVX1: # %bb.0: 3571; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 3572; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3573; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3574; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3575; AVX1-NEXT: vzeroupper 3576; AVX1-NEXT: retq 3577; 3578; AVX2-LABEL: trunc_xor_v16i16_v16i8: 3579; AVX2: # %bb.0: 3580; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 3581; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3582; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3583; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3584; AVX2-NEXT: vzeroupper 3585; AVX2-NEXT: retq 3586; 3587; AVX512F-LABEL: trunc_xor_v16i16_v16i8: 3588; AVX512F: # %bb.0: 3589; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 3590; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3591; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3592; AVX512F-NEXT: vzeroupper 3593; AVX512F-NEXT: retq 3594; 3595; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: 3596; AVX512BW: # %bb.0: 3597; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 3598; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3599; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3600; AVX512BW-NEXT: vzeroupper 3601; AVX512BW-NEXT: retq 3602; 3603; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8: 3604; AVX512DQ: # %bb.0: 3605; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 3606; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3607; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3608; AVX512DQ-NEXT: vzeroupper 3609; AVX512DQ-NEXT: retq 3610 %1 = xor <16 x i16> %a0, %a1 3611 %2 = trunc <16 x i16> %1 to <16 x i8> 3612 ret <16 x i8> %2 3613} 3614 3615; 3616; xor to constant 3617; 3618 3619define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { 3620; SSE-LABEL: trunc_xor_const_v4i64_v4i32: 3621; SSE: # %bb.0: 3622; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3623; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3624; SSE-NEXT: retq 3625; 3626; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: 3627; AVX1: # %bb.0: 3628; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3629; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3630; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3631; AVX1-NEXT: vzeroupper 3632; AVX1-NEXT: retq 3633; 3634; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: 3635; AVX2-SLOW: # %bb.0: 3636; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3637; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3638; AVX2-SLOW-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3639; AVX2-SLOW-NEXT: vzeroupper 3640; AVX2-SLOW-NEXT: retq 3641; 3642; AVX2-FAST-ALL-LABEL: trunc_xor_const_v4i64_v4i32: 3643; AVX2-FAST-ALL: # %bb.0: 3644; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 3645; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 3646; AVX2-FAST-ALL-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3647; AVX2-FAST-ALL-NEXT: vzeroupper 3648; AVX2-FAST-ALL-NEXT: retq 3649; 3650; AVX2-FAST-PERLANE-LABEL: trunc_xor_const_v4i64_v4i32: 3651; AVX2-FAST-PERLANE: # %bb.0: 3652; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 3653; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[0,2],xmm1[0,2] 3654; AVX2-FAST-PERLANE-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3655; AVX2-FAST-PERLANE-NEXT: vzeroupper 3656; AVX2-FAST-PERLANE-NEXT: retq 3657; 3658; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: 3659; AVX512: # %bb.0: 3660; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3661; AVX512-NEXT: vpmovqd %zmm0, %ymm0 3662; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3663; AVX512-NEXT: vzeroupper 3664; AVX512-NEXT: retq 3665 %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3> 3666 %2 = trunc <4 x i64> %1 to <4 x i32> 3667 ret <4 x i32> %2 3668} 3669 3670define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { 3671; SSE-LABEL: trunc_xor_const_v8i64_v8i16: 3672; SSE: # %bb.0: 3673; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 3674; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 3675; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 3676; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 3677; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 3678; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 3679; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 3680; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 3681; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 3682; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3683; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 3684; SSE-NEXT: xorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3685; SSE-NEXT: retq 3686; 3687; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: 3688; AVX1: # %bb.0: 3689; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 3690; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3691; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3692; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3693; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3694; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3695; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3696; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3697; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3698; AVX1-NEXT: vzeroupper 3699; AVX1-NEXT: retq 3700; 3701; AVX2-LABEL: trunc_xor_const_v8i64_v8i16: 3702; AVX2: # %bb.0: 3703; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 3704; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 3705; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 3706; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3707; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3708; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3709; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3710; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3711; AVX2-NEXT: vzeroupper 3712; AVX2-NEXT: retq 3713; 3714; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: 3715; AVX512: # %bb.0: 3716; AVX512-NEXT: vpmovqw %zmm0, %xmm0 3717; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3718; AVX512-NEXT: vzeroupper 3719; AVX512-NEXT: retq 3720 %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7> 3721 %2 = trunc <8 x i64> %1 to <8 x i16> 3722 ret <8 x i16> %2 3723} 3724 3725define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { 3726; SSE-LABEL: trunc_xor_const_v8i32_v8i16: 3727; SSE: # %bb.0: 3728; SSE-NEXT: pslld $16, %xmm1 3729; SSE-NEXT: psrad $16, %xmm1 3730; SSE-NEXT: pslld $16, %xmm0 3731; SSE-NEXT: psrad $16, %xmm0 3732; SSE-NEXT: packssdw %xmm1, %xmm0 3733; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3734; 
SSE-NEXT: retq 3735; 3736; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: 3737; AVX1: # %bb.0: 3738; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3739; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 3740; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 3741; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 3742; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3743; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3744; AVX1-NEXT: vzeroupper 3745; AVX1-NEXT: retq 3746; 3747; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: 3748; AVX2: # %bb.0: 3749; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 3750; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 3751; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3752; AVX2-NEXT: vzeroupper 3753; AVX2-NEXT: retq 3754; 3755; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: 3756; AVX512: # %bb.0: 3757; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3758; AVX512-NEXT: vpmovdw %zmm0, %ymm0 3759; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3760; AVX512-NEXT: vzeroupper 3761; AVX512-NEXT: retq 3762 %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 3763 %2 = trunc <8 x i32> %1 to <8 x i16> 3764 ret <8 x i16> %2 3765} 3766 3767define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { 3768; SSE-LABEL: trunc_xor_const_v16i64_v16i8: 3769; SSE: # %bb.0: 3770; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 3771; SSE-NEXT: pand %xmm8, %xmm7 3772; SSE-NEXT: pand %xmm8, %xmm6 3773; SSE-NEXT: packuswb %xmm7, %xmm6 3774; SSE-NEXT: pand %xmm8, %xmm5 3775; SSE-NEXT: pand %xmm8, %xmm4 3776; SSE-NEXT: packuswb %xmm5, %xmm4 3777; SSE-NEXT: packuswb %xmm6, %xmm4 3778; SSE-NEXT: pand %xmm8, %xmm3 3779; SSE-NEXT: pand %xmm8, %xmm2 3780; SSE-NEXT: packuswb %xmm3, %xmm2 3781; SSE-NEXT: pand %xmm8, %xmm1 3782; SSE-NEXT: pand %xmm8, %xmm0 3783; SSE-NEXT: packuswb %xmm1, %xmm0 3784; SSE-NEXT: packuswb %xmm2, %xmm0 3785; SSE-NEXT: packuswb %xmm4, %xmm0 3786; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3787; SSE-NEXT: retq 3788; 3789; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: 3790; AVX1: # %bb.0: 3791; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 3792; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 3793; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 3794; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 3795; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 3796; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 3797; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 3798; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 3799; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 3800; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3801; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3802; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 3803; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 3804; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 3805; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 3806; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 3807; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3808; AVX1-NEXT: vzeroupper 3809; AVX1-NEXT: retq 3810; 3811; AVX2-LABEL: trunc_xor_const_v16i64_v16i8: 3812; AVX2: # %bb.0: 3813; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] 3814; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 3815; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 3816; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2 3817; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] 3818; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 3819; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 3820; AVX2-NEXT: vpackusdw %ymm1, %ymm0, 
%ymm0 3821; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 3822; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0 3823; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3824; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3825; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3826; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3827; AVX2-NEXT: vzeroupper 3828; AVX2-NEXT: retq 3829; 3830; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: 3831; AVX512: # %bb.0: 3832; AVX512-NEXT: vpmovqb %zmm1, %xmm1 3833; AVX512-NEXT: vpmovqb %zmm0, %xmm0 3834; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 3835; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3836; AVX512-NEXT: vzeroupper 3837; AVX512-NEXT: retq 3838 %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15> 3839 %2 = trunc <16 x i64> %1 to <16 x i8> 3840 ret <16 x i8> %2 3841} 3842 3843define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { 3844; SSE-LABEL: trunc_xor_const_v16i32_v16i8: 3845; SSE: # %bb.0: 3846; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] 3847; SSE-NEXT: pand %xmm4, %xmm3 3848; SSE-NEXT: pand %xmm4, %xmm2 3849; SSE-NEXT: packuswb %xmm3, %xmm2 3850; SSE-NEXT: pand %xmm4, %xmm1 3851; SSE-NEXT: pand %xmm4, %xmm0 3852; SSE-NEXT: packuswb %xmm1, %xmm0 3853; SSE-NEXT: packuswb %xmm2, %xmm0 3854; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3855; SSE-NEXT: retq 3856; 3857; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: 3858; AVX1: # %bb.0: 3859; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3860; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 3861; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 3862; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 3863; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 3864; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 3865; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 3866; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3867; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3868; AVX1-NEXT: vzeroupper 3869; AVX1-NEXT: retq 3870; 3871; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: 3872; AVX2: # %bb.0: 3873; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] 3874; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 3875; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 3876; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 3877; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3878; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3879; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 3880; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3881; AVX2-NEXT: vzeroupper 3882; AVX2-NEXT: retq 3883; 3884; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: 3885; AVX512: # %bb.0: 3886; AVX512-NEXT: vpmovdb %zmm0, %xmm0 3887; AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3888; AVX512-NEXT: vzeroupper 3889; AVX512-NEXT: retq 3890 %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 3891 %2 = trunc <16 x i32> %1 to <16 x i8> 3892 ret <16 x i8> %2 3893} 3894 3895define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { 3896; SSE-LABEL: trunc_xor_const_v16i16_v16i8: 3897; SSE: # %bb.0: 3898; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] 3899; SSE-NEXT: pand %xmm2, %xmm1 3900; SSE-NEXT: pand %xmm2, %xmm0 3901; SSE-NEXT: packuswb %xmm1, %xmm0 3902; SSE-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 3903; SSE-NEXT: retq 3904; 3905; 
AVX1-LABEL: trunc_xor_const_v16i16_v16i8: 3906; AVX1: # %bb.0: 3907; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3908; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3909; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3910; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3911; AVX1-NEXT: vzeroupper 3912; AVX1-NEXT: retq 3913; 3914; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: 3915; AVX2: # %bb.0: 3916; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 3917; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 3918; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 3919; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3920; AVX2-NEXT: vzeroupper 3921; AVX2-NEXT: retq 3922; 3923; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: 3924; AVX512F: # %bb.0: 3925; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3926; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 3927; AVX512F-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3928; AVX512F-NEXT: vzeroupper 3929; AVX512F-NEXT: retq 3930; 3931; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: 3932; AVX512BW: # %bb.0: 3933; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 3934; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 3935; AVX512BW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3936; AVX512BW-NEXT: vzeroupper 3937; AVX512BW-NEXT: retq 3938; 3939; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: 3940; AVX512DQ: # %bb.0: 3941; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 3942; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 3943; AVX512DQ-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 3944; AVX512DQ-NEXT: vzeroupper 3945; AVX512DQ-NEXT: retq 3946 %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15> 3947 %2 = trunc <16 x i16> %1 to <16 x i8> 3948 ret <16 x i8> %2 3949} 3950 3951; 3952; or 3953; 3954 3955define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { 3956; SSE-LABEL: trunc_or_v4i64_v4i32: 3957; SSE: # %bb.0: 3958; SSE-NEXT: orps %xmm3, %xmm1 3959; SSE-NEXT: orps %xmm2, %xmm0 3960; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3961; SSE-NEXT: retq 3962; 3963; AVX1-LABEL: trunc_or_v4i64_v4i32: 3964; AVX1: # %bb.0: 3965; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 3966; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 3967; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3968; AVX1-NEXT: vzeroupper 3969; AVX1-NEXT: retq 3970; 3971; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: 3972; AVX2-SLOW: # %bb.0: 3973; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 3974; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 3975; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3976; AVX2-SLOW-NEXT: vzeroupper 3977; AVX2-SLOW-NEXT: retq 3978; 3979; AVX2-FAST-ALL-LABEL: trunc_or_v4i64_v4i32: 3980; AVX2-FAST-ALL: # %bb.0: 3981; AVX2-FAST-ALL-NEXT: vorps %ymm1, %ymm0, %ymm0 3982; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u> 3983; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 3984; AVX2-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 3985; AVX2-FAST-ALL-NEXT: vzeroupper 3986; AVX2-FAST-ALL-NEXT: retq 
3987; 3988; AVX2-FAST-PERLANE-LABEL: trunc_or_v4i64_v4i32: 3989; AVX2-FAST-PERLANE: # %bb.0: 3990; AVX2-FAST-PERLANE-NEXT: vorps %ymm1, %ymm0, %ymm0 3991; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1 3992; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 3993; AVX2-FAST-PERLANE-NEXT: vzeroupper 3994; AVX2-FAST-PERLANE-NEXT: retq 3995; 3996; AVX512-LABEL: trunc_or_v4i64_v4i32: 3997; AVX512: # %bb.0: 3998; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 3999; AVX512-NEXT: vpmovqd %zmm0, %ymm0 4000; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4001; AVX512-NEXT: vzeroupper 4002; AVX512-NEXT: retq 4003 %1 = or <4 x i64> %a0, %a1 4004 %2 = trunc <4 x i64> %1 to <4 x i32> 4005 ret <4 x i32> %2 4006} 4007 4008define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { 4009; SSE-LABEL: trunc_or_v8i64_v8i16: 4010; SSE: # %bb.0: 4011; SSE-NEXT: por %xmm6, %xmm2 4012; SSE-NEXT: por %xmm7, %xmm3 4013; SSE-NEXT: por %xmm4, %xmm0 4014; SSE-NEXT: por %xmm5, %xmm1 4015; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 4016; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] 4017; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 4018; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] 4019; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 4020; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] 4021; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] 4022; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 4023; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] 4024; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 4025; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] 4026; SSE-NEXT: retq 4027; 4028; AVX1-LABEL: trunc_or_v8i64_v8i16: 4029; AVX1: # %bb.0: 4030; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 4031; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 4032; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] 4033; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 4034; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4035; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4036; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 4037; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 4038; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 4039; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4040; AVX1-NEXT: vzeroupper 4041; AVX1-NEXT: retq 4042; 4043; AVX2-LABEL: trunc_or_v8i64_v8i16: 4044; AVX2: # %bb.0: 4045; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 4046; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 4047; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 4048; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15] 4049; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15] 4050; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 4051; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 4052; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4053; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] 4054; AVX2-NEXT: vzeroupper 4055; AVX2-NEXT: retq 4056; 4057; AVX512-LABEL: trunc_or_v8i64_v8i16: 4058; AVX512: # %bb.0: 4059; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 4060; AVX512-NEXT: vpmovqw %zmm0, %xmm0 4061; AVX512-NEXT: vzeroupper 4062; AVX512-NEXT: retq 4063 %1 = or <8 x i64> %a0, %a1 4064 %2 = trunc <8 x i64> %1 to <8 x i16> 4065 ret <8 x i16> %2 4066} 4067 4068define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { 4069; SSE-LABEL: trunc_or_v8i32_v8i16: 4070; SSE: # %bb.0: 4071; SSE-NEXT: por %xmm2, %xmm0 4072; SSE-NEXT: por %xmm3, %xmm1 4073; SSE-NEXT: pslld $16, %xmm1 4074; 
SSE-NEXT: psrad $16, %xmm1 4075; SSE-NEXT: pslld $16, %xmm0 4076; SSE-NEXT: psrad $16, %xmm0 4077; SSE-NEXT: packssdw %xmm1, %xmm0 4078; SSE-NEXT: retq 4079; 4080; AVX1-LABEL: trunc_or_v8i32_v8i16: 4081; AVX1: # %bb.0: 4082; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 4083; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 4084; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u> 4085; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 4086; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 4087; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] 4088; AVX1-NEXT: vzeroupper 4089; AVX1-NEXT: retq 4090; 4091; AVX2-LABEL: trunc_or_v8i32_v8i16: 4092; AVX2: # %bb.0: 4093; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 4094; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u] 4095; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] 4096; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4097; AVX2-NEXT: vzeroupper 4098; AVX2-NEXT: retq 4099; 4100; AVX512-LABEL: trunc_or_v8i32_v8i16: 4101; AVX512: # %bb.0: 4102; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 4103; AVX512-NEXT: vpmovdw %zmm0, %ymm0 4104; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 4105; AVX512-NEXT: vzeroupper 4106; AVX512-NEXT: retq 4107 %1 = or <8 x i32> %a0, %a1 4108 %2 = trunc <8 x i32> %1 to <8 x i16> 4109 ret <8 x i16> %2 4110} 4111 4112define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { 4113; SSE-LABEL: trunc_or_v16i64_v16i8: 4114; SSE: # %bb.0: 4115; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0 4116; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1 4117; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2 4118; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3 4119; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4 4120; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5 4121; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6 4122; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7 4123; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] 4124; SSE-NEXT: pand %xmm8, %xmm7 4125; SSE-NEXT: pand %xmm8, %xmm6 4126; SSE-NEXT: packuswb %xmm7, %xmm6 4127; SSE-NEXT: pand %xmm8, %xmm5 4128; SSE-NEXT: pand %xmm8, %xmm4 4129; SSE-NEXT: packuswb %xmm5, %xmm4 4130; SSE-NEXT: packuswb %xmm6, %xmm4 4131; SSE-NEXT: pand %xmm8, %xmm3 4132; SSE-NEXT: pand %xmm8, %xmm2 4133; SSE-NEXT: packuswb %xmm3, %xmm2 4134; SSE-NEXT: pand %xmm8, %xmm1 4135; SSE-NEXT: pand %xmm8, %xmm0 4136; SSE-NEXT: packuswb %xmm1, %xmm0 4137; SSE-NEXT: packuswb %xmm2, %xmm0 4138; SSE-NEXT: packuswb %xmm4, %xmm0 4139; SSE-NEXT: retq 4140; 4141; AVX1-LABEL: trunc_or_v16i64_v16i8: 4142; AVX1: # %bb.0: 4143; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 4144; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 4145; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 4146; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 4147; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] 4148; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 4149; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 4150; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 4151; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 4152; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 4153; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 4154; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 4155; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 4156; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 4157; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 4158; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 4159; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 4160; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 4161; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 4162; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 4163; AVX1-NEXT: vzeroupper 4164; AVX1-NEXT: retq 4165; 4166; 
; AVX2-LABEL: trunc_or_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2
; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i64> %a0, %a1
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm4, %xmm0
; SSE-NEXT: por %xmm5, %xmm1
; SSE-NEXT: por %xmm6, %xmm2
; SSE-NEXT: por %xmm7, %xmm3
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, %a1
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
; SSE-LABEL: trunc_or_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: por %xmm2, %xmm0
; SSE-NEXT: por %xmm3, %xmm1
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, %a1
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; or to constant
;

define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vzeroupper
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-ALL-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,6,u,u,u,u>
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT: vzeroupper
; AVX2-FAST-ALL-NEXT: retq
;
; AVX2-FAST-PERLANE-LABEL: trunc_or_const_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vzeroupper
; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i64_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
; SSE-NEXT: orpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i64_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7],ymm0[8],ymm2[9,10,11],ymm0[12],ymm2[13,14,15]
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqw %zmm0, %xmm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
  %2 = trunc <8 x i64> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v8i32_v8i16:
; SSE: # %bb.0:
; SSE-NEXT: pslld $16, %xmm1
; SSE-NEXT: psrad $16, %xmm1
; SSE-NEXT: pslld $16, %xmm0
; SSE-NEXT: psrad $16, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i64_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
; SSE-NEXT: pand %xmm8, %xmm7
; SSE-NEXT: pand %xmm8, %xmm6
; SSE-NEXT: packuswb %xmm7, %xmm6
; SSE-NEXT: pand %xmm8, %xmm5
; SSE-NEXT: pand %xmm8, %xmm4
; SSE-NEXT: packuswb %xmm5, %xmm4
; SSE-NEXT: packuswb %xmm6, %xmm4
; SSE-NEXT: pand %xmm8, %xmm3
; SSE-NEXT: pand %xmm8, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm8, %xmm1
; SSE-NEXT: pand %xmm8, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm4, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i64_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackusdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: vpackusdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
  %2 = trunc <16 x i64> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i32_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
; SSE-NEXT: pand %xmm4, %xmm3
; SSE-NEXT: pand %xmm4, %xmm2
; SSE-NEXT: packuswb %xmm3, %xmm2
; SSE-NEXT: pand %xmm4, %xmm1
; SSE-NEXT: pand %xmm4, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: packuswb %xmm2, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
  %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  ret <16 x i8> %2
}

define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
; SSE-LABEL: trunc_or_const_v16i16_v16i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
; SSE-NEXT: pand %xmm2, %xmm1
; SSE-NEXT: pand %xmm2, %xmm0
; SSE-NEXT: packuswb %xmm1, %xmm0
; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
  %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

;
; complex patterns - often created by vectorizer
;

define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_const_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_const_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_self_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_self_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %3, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}

define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,3,3]
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
; SSE-NEXT: paddd %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
; AVX: # %bb.0:
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = sext <4 x i32> %a0 to <4 x i64>
  %2 = sext <4 x i32> %a1 to <4 x i64>
  %3 = mul <4 x i64> %1, %2
  %4 = add <4 x i64> %1, %3
  %5 = trunc <4 x i64> %4 to <4 x i32>
  ret <4 x i32> %5
}