1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW 11 12declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>) 13declare {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32>, <2 x i32>) 14declare {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32>, <3 x i32>) 15declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>) 16declare {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32>, <6 x i32>) 17declare {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>) 18declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>) 19 20declare {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>) 21declare {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>) 22declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>) 23declare {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>) 24declare {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>) 25 26declare {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24>, <4 x i24>) 27declare {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1>, <4 x i1>) 28declare {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128>, <2 x i128>) 29 30define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, <1 x i32>* %p2) nounwind { 31; CHECK-LABEL: umulo_v1i32: 32; CHECK: # %bb.0: 33; CHECK-NEXT: movq %rdx, %rcx 34; CHECK-NEXT: movl %edi, %eax 35; CHECK-NEXT: xorl %edi, %edi 36; CHECK-NEXT: mull %esi 37; CHECK-NEXT: seto %dil 38; CHECK-NEXT: negl %edi 39; CHECK-NEXT: movl %eax, (%rcx) 40; CHECK-NEXT: movl %edi, %eax 41; CHECK-NEXT: retq 42 %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) 43 %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 44 %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 45 %res = sext <1 x i1> %obit to <1 x i32> 46 store <1 x i32> %val, <1 x i32>* %p2 47 ret <1 x i32> %res 48} 49 50define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32>* %p2) nounwind { 51; SSE2-LABEL: umulo_v2i32: 52; SSE2: # %bb.0: 53; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 54; SSE2-NEXT: pmuludq %xmm1, %xmm0 55; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 56; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 57; SSE2-NEXT: pmuludq %xmm2, %xmm4 58; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 59; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 60; SSE2-NEXT: pxor %xmm2, %xmm2 61; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 62; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 63; SSE2-NEXT: pxor %xmm2, %xmm1 64; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 65; SSE2-NEXT: movq %xmm0, (%rdi) 66; SSE2-NEXT: movdqa %xmm1, %xmm0 67; SSE2-NEXT: retq 68; 69; SSSE3-LABEL: umulo_v2i32: 70; SSSE3: # %bb.0: 71; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 72; SSSE3-NEXT: pmuludq %xmm1, %xmm0 73; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 74; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 75; SSSE3-NEXT: pmuludq %xmm2, %xmm4 76; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 77; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 78; SSSE3-NEXT: pxor %xmm2, %xmm2 79; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 80; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 81; SSSE3-NEXT: pxor %xmm2, %xmm1 82; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 83; SSSE3-NEXT: movq %xmm0, (%rdi) 84; SSSE3-NEXT: movdqa %xmm1, %xmm0 85; SSSE3-NEXT: retq 86; 87; SSE41-LABEL: umulo_v2i32: 88; SSE41: # %bb.0: 89; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 90; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 91; SSE41-NEXT: pmuludq %xmm2, %xmm3 92; SSE41-NEXT: movdqa %xmm0, %xmm2 93; SSE41-NEXT: pmuludq %xmm1, %xmm2 94; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 95; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 96; SSE41-NEXT: pxor %xmm3, %xmm3 97; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 98; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 99; SSE41-NEXT: pxor %xmm3, %xmm2 100; SSE41-NEXT: pmulld %xmm1, %xmm0 101; SSE41-NEXT: movq %xmm0, (%rdi) 102; SSE41-NEXT: movdqa %xmm2, %xmm0 103; SSE41-NEXT: retq 104; 105; AVX1-LABEL: umulo_v2i32: 106; AVX1: # %bb.0: 107; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 108; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 109; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 110; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 111; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 112; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 113; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 114; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 115; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 116; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 117; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 118; AVX1-NEXT: vmovq %xmm0, (%rdi) 119; AVX1-NEXT: vmovdqa %xmm2, %xmm0 120; AVX1-NEXT: retq 121; 122; AVX2-LABEL: umulo_v2i32: 123; AVX2: # %bb.0: 124; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 125; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 126; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 127; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 128; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 129; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 130; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 131; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 132; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 133; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 134; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 135; AVX2-NEXT: vmovq %xmm0, (%rdi) 136; AVX2-NEXT: vmovdqa %xmm2, %xmm0 137; AVX2-NEXT: retq 138; 139; AVX512-LABEL: umulo_v2i32: 140; AVX512: # %bb.0: 141; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 142; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 143; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 144; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 145; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 146; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 147; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 148; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 149; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 150; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 151; AVX512-NEXT: vmovq %xmm1, (%rdi) 152; AVX512-NEXT: retq 153 %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) 154 %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 155 %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 156 %res = sext <2 x i1> %obit to <2 x i32> 157 store <2 x i32> %val, <2 x i32>* %p2 158 ret <2 x i32> %res 159} 160 161define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, <3 x i32>* %p2) nounwind { 162; SSE2-LABEL: umulo_v3i32: 163; SSE2: # %bb.0: 164; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 165; SSE2-NEXT: pmuludq %xmm1, %xmm0 166; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 167; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 168; SSE2-NEXT: pmuludq %xmm2, %xmm4 169; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 170; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 171; SSE2-NEXT: pxor %xmm2, %xmm2 172; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 173; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 174; SSE2-NEXT: pxor %xmm2, %xmm1 175; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 176; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 177; SSE2-NEXT: movd %xmm2, 8(%rdi) 178; SSE2-NEXT: movq %xmm0, (%rdi) 179; SSE2-NEXT: movdqa %xmm1, %xmm0 180; SSE2-NEXT: retq 181; 182; SSSE3-LABEL: umulo_v3i32: 183; SSSE3: # %bb.0: 184; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 185; SSSE3-NEXT: pmuludq %xmm1, %xmm0 186; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 187; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 188; SSSE3-NEXT: pmuludq %xmm2, %xmm4 189; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 190; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 191; SSSE3-NEXT: pxor %xmm2, %xmm2 192; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 193; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 194; SSSE3-NEXT: pxor %xmm2, %xmm1 195; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 196; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 197; SSSE3-NEXT: movd %xmm2, 8(%rdi) 198; SSSE3-NEXT: movq %xmm0, (%rdi) 199; SSSE3-NEXT: movdqa %xmm1, %xmm0 200; SSSE3-NEXT: retq 201; 202; SSE41-LABEL: umulo_v3i32: 203; SSE41: # %bb.0: 204; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 205; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 206; SSE41-NEXT: pmuludq %xmm2, %xmm3 207; SSE41-NEXT: movdqa %xmm0, %xmm2 208; SSE41-NEXT: pmuludq %xmm1, %xmm2 209; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 210; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 211; SSE41-NEXT: pxor %xmm3, %xmm3 212; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 213; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 214; SSE41-NEXT: pxor %xmm3, %xmm2 215; SSE41-NEXT: pmulld %xmm1, %xmm0 216; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) 217; SSE41-NEXT: movq %xmm0, (%rdi) 218; SSE41-NEXT: movdqa %xmm2, %xmm0 219; SSE41-NEXT: retq 220; 221; AVX1-LABEL: umulo_v3i32: 222; AVX1: # %bb.0: 223; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 224; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 225; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 226; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 227; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 228; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 229; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 230; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 231; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 232; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 233; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 234; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi) 235; AVX1-NEXT: vmovq %xmm0, (%rdi) 236; AVX1-NEXT: vmovdqa %xmm2, %xmm0 237; AVX1-NEXT: retq 238; 239; AVX2-LABEL: umulo_v3i32: 240; AVX2: # %bb.0: 241; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 242; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 243; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 244; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 245; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 246; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 247; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 248; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 249; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 250; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 251; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 252; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi) 253; AVX2-NEXT: vmovq %xmm0, (%rdi) 254; AVX2-NEXT: vmovdqa %xmm2, %xmm0 255; AVX2-NEXT: retq 256; 257; AVX512-LABEL: umulo_v3i32: 258; AVX512: # %bb.0: 259; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 260; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 261; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 262; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 263; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 264; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 265; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 266; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 267; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 268; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 269; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) 270; AVX512-NEXT: vmovq %xmm1, (%rdi) 271; AVX512-NEXT: retq 272 %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) 273 %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 274 %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 275 %res = sext <3 x i1> %obit to <3 x i32> 276 store <3 x i32> %val, <3 x i32>* %p2 277 ret <3 x i32> %res 278} 279 280define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, <4 x i32>* %p2) nounwind { 281; SSE2-LABEL: umulo_v4i32: 282; SSE2: # %bb.0: 283; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 284; SSE2-NEXT: pmuludq %xmm1, %xmm0 285; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 286; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 287; SSE2-NEXT: pmuludq %xmm2, %xmm4 288; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 289; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 290; SSE2-NEXT: pxor %xmm2, %xmm2 291; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 292; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 293; SSE2-NEXT: pxor %xmm2, %xmm1 294; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 295; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 296; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 297; SSE2-NEXT: movdqa %xmm0, (%rdi) 298; SSE2-NEXT: movdqa %xmm1, %xmm0 299; SSE2-NEXT: retq 300; 301; SSSE3-LABEL: umulo_v4i32: 302; SSSE3: # %bb.0: 303; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 304; SSSE3-NEXT: pmuludq %xmm1, %xmm0 305; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 306; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 307; SSSE3-NEXT: pmuludq %xmm2, %xmm4 308; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 309; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 310; SSSE3-NEXT: pxor %xmm2, %xmm2 311; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 312; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 313; SSSE3-NEXT: pxor %xmm2, %xmm1 314; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 315; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 316; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 317; SSSE3-NEXT: movdqa %xmm0, (%rdi) 318; SSSE3-NEXT: movdqa %xmm1, %xmm0 319; SSSE3-NEXT: retq 320; 321; SSE41-LABEL: umulo_v4i32: 322; SSE41: # %bb.0: 323; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 324; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 325; SSE41-NEXT: pmuludq %xmm2, %xmm3 326; SSE41-NEXT: movdqa %xmm0, %xmm2 327; SSE41-NEXT: pmuludq %xmm1, %xmm2 328; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 329; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 330; SSE41-NEXT: pxor %xmm3, %xmm3 331; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 332; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 333; SSE41-NEXT: pxor %xmm3, %xmm2 334; SSE41-NEXT: pmulld %xmm1, %xmm0 335; SSE41-NEXT: movdqa %xmm0, (%rdi) 336; SSE41-NEXT: movdqa %xmm2, %xmm0 337; SSE41-NEXT: retq 338; 339; AVX1-LABEL: umulo_v4i32: 340; AVX1: # %bb.0: 341; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 342; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 343; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 344; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 345; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 346; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 347; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 348; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 349; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 350; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 351; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 352; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 353; AVX1-NEXT: vmovdqa %xmm2, %xmm0 354; AVX1-NEXT: retq 355; 356; AVX2-LABEL: umulo_v4i32: 357; AVX2: # %bb.0: 358; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 359; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 360; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 361; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 362; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 363; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 364; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 365; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 366; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 367; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 368; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 369; AVX2-NEXT: vmovdqa %xmm0, (%rdi) 370; AVX2-NEXT: vmovdqa %xmm2, %xmm0 371; AVX2-NEXT: retq 372; 373; AVX512-LABEL: umulo_v4i32: 374; AVX512: # %bb.0: 375; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 376; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 377; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 378; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 379; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 380; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 381; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 382; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 383; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 384; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 385; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 386; AVX512-NEXT: retq 387 %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) 388 %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 389 %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 390 %res = sext <4 x i1> %obit to <4 x i32> 391 store <4 x i32> %val, <4 x i32>* %p2 392 ret <4 x i32> %res 393} 394 395define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, <6 x i32>* %p2) nounwind { 396; SSE2-LABEL: umulo_v6i32: 397; SSE2: # %bb.0: 398; SSE2-NEXT: movq %rdi, %rax 399; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 400; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 401; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 402; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 403; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 404; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 405; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 406; SSE2-NEXT: movd %r8d, %xmm0 407; SSE2-NEXT: movd %ecx, %xmm1 408; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 409; SSE2-NEXT: movd %edx, %xmm0 410; SSE2-NEXT: movd %esi, %xmm3 411; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 412; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 413; SSE2-NEXT: movd %r9d, %xmm1 414; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx 415; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 416; SSE2-NEXT: pmuludq %xmm1, %xmm0 417; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 418; SSE2-NEXT: pmuludq %xmm2, %xmm3 419; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] 420; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 421; SSE2-NEXT: pmuludq %xmm4, %xmm2 422; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] 423; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 424; SSE2-NEXT: pxor %xmm4, %xmm4 425; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 426; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 427; SSE2-NEXT: pxor %xmm5, %xmm1 428; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 429; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 430; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 431; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 432; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero 433; SSE2-NEXT: pmuludq %xmm2, %xmm6 434; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 435; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] 436; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 437; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 438; SSE2-NEXT: pxor %xmm5, %xmm7 439; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 440; SSE2-NEXT: movq %xmm0, 16(%rcx) 441; SSE2-NEXT: movdqa %xmm3, (%rcx) 442; SSE2-NEXT: movq %xmm7, 16(%rdi) 443; SSE2-NEXT: movdqa %xmm1, (%rdi) 444; SSE2-NEXT: retq 445; 446; SSSE3-LABEL: umulo_v6i32: 447; SSSE3: # %bb.0: 448; SSSE3-NEXT: movq %rdi, %rax 449; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 450; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 451; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 452; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 453; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 454; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 455; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 456; SSSE3-NEXT: movd %r8d, %xmm0 457; SSSE3-NEXT: movd %ecx, %xmm1 458; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 459; SSSE3-NEXT: movd %edx, %xmm0 460; SSSE3-NEXT: movd %esi, %xmm3 461; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 462; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 463; SSSE3-NEXT: movd %r9d, %xmm1 464; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx 465; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 466; SSSE3-NEXT: pmuludq %xmm1, %xmm0 467; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 468; SSSE3-NEXT: pmuludq %xmm2, %xmm3 469; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] 470; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 471; SSSE3-NEXT: pmuludq %xmm4, %xmm2 472; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] 473; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 474; SSSE3-NEXT: pxor %xmm4, %xmm4 475; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 476; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 477; SSSE3-NEXT: pxor %xmm5, %xmm1 478; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 479; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 480; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 481; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 482; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero 483; SSSE3-NEXT: pmuludq %xmm2, %xmm6 484; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 485; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] 486; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 487; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 488; SSSE3-NEXT: pxor %xmm5, %xmm7 489; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 490; SSSE3-NEXT: movq %xmm0, 16(%rcx) 491; SSSE3-NEXT: movdqa %xmm3, (%rcx) 492; SSSE3-NEXT: movq %xmm7, 16(%rdi) 493; SSSE3-NEXT: movdqa %xmm1, (%rdi) 494; SSSE3-NEXT: retq 495; 496; SSE41-LABEL: umulo_v6i32: 497; SSE41: # %bb.0: 498; SSE41-NEXT: movq %rdi, %rax 499; SSE41-NEXT: movd %esi, %xmm2 500; SSE41-NEXT: pinsrd $1, %edx, %xmm2 501; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 502; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 503; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 504; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm1 505; SSE41-NEXT: movdqa %xmm1, %xmm0 506; SSE41-NEXT: pmuludq %xmm2, %xmm1 507; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 508; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero 509; SSE41-NEXT: movd %r9d, %xmm4 510; SSE41-NEXT: movdqa %xmm4, %xmm5 511; SSE41-NEXT: pmuludq %xmm3, %xmm4 512; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 513; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm5 514; SSE41-NEXT: pmulld %xmm3, %xmm5 515; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm0 516; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx 517; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 518; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] 519; SSE41-NEXT: pmuludq %xmm3, %xmm6 520; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 521; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 522; SSE41-NEXT: pxor %xmm8, %xmm8 523; SSE41-NEXT: pcmpeqd %xmm8, %xmm1 524; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 525; SSE41-NEXT: pxor %xmm6, %xmm1 526; SSE41-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero 527; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero 528; SSE41-NEXT: pmuludq %xmm7, %xmm3 529; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 530; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] 531; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 532; SSE41-NEXT: pxor %xmm6, %xmm4 533; SSE41-NEXT: pmulld %xmm2, %xmm0 534; SSE41-NEXT: movq %xmm5, 16(%rcx) 535; SSE41-NEXT: movdqa %xmm0, (%rcx) 536; SSE41-NEXT: movq %xmm4, 16(%rdi) 537; SSE41-NEXT: movdqa %xmm1, (%rdi) 538; SSE41-NEXT: retq 539; 540; AVX1-LABEL: umulo_v6i32: 541; AVX1: # %bb.0: 542; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 543; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 544; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 545; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 546; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 547; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5 548; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 549; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] 550; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 551; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2 552; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 553; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 554; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 555; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 556; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 557; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm7 558; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 559; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 560; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 561; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 562; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 563; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 564; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 565; AVX1-NEXT: vmovq %xmm1, 16(%rdi) 566; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 567; AVX1-NEXT: vmovaps %ymm2, %ymm0 568; AVX1-NEXT: retq 569; 570; AVX2-LABEL: umulo_v6i32: 571; AVX2: # %bb.0: 572; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] 573; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] 574; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 575; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm3 576; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] 577; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 578; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 579; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 580; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 581; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 582; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 583; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 584; AVX2-NEXT: vmovq %xmm1, 16(%rdi) 585; AVX2-NEXT: vmovdqa %xmm0, (%rdi) 586; AVX2-NEXT: vmovdqa %ymm2, %ymm0 587; AVX2-NEXT: retq 588; 589; AVX512-LABEL: umulo_v6i32: 590; AVX512: # %bb.0: 591; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 592; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] 593; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] 594; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 595; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] 596; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 597; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 598; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 599; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 600; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 601; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 602; AVX512-NEXT: vmovq %xmm2, 16(%rdi) 603; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 604; AVX512-NEXT: retq 605 %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) 606 %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 607 %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 608 %res = sext <6 x i1> %obit to <6 x i32> 609 store <6 x i32> %val, <6 x i32>* %p2 610 ret <6 x i32> %res 611} 612 613define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, <8 x i32>* %p2) nounwind { 614; SSE2-LABEL: umulo_v8i32: 615; SSE2: # %bb.0: 616; SSE2-NEXT: movdqa %xmm0, %xmm4 617; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 618; SSE2-NEXT: pmuludq %xmm2, %xmm4 619; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] 620; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] 621; SSE2-NEXT: pmuludq %xmm5, %xmm6 622; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 623; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 624; SSE2-NEXT: pxor %xmm8, %xmm8 625; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 626; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 627; SSE2-NEXT: pxor %xmm7, %xmm0 628; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 629; SSE2-NEXT: pmuludq %xmm3, %xmm1 630; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 631; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 632; SSE2-NEXT: pmuludq %xmm5, %xmm3 633; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] 634; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 635; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 636; SSE2-NEXT: pxor %xmm7, %xmm2 637; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 638; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] 639; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 640; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 641; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 642; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 643; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 644; SSE2-NEXT: movdqa %xmm4, (%rdi) 645; SSE2-NEXT: movdqa %xmm2, %xmm1 646; SSE2-NEXT: retq 647; 648; SSSE3-LABEL: umulo_v8i32: 649; SSSE3: # %bb.0: 650; SSSE3-NEXT: movdqa %xmm0, %xmm4 651; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 652; SSSE3-NEXT: pmuludq %xmm2, %xmm4 653; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] 654; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] 655; SSSE3-NEXT: pmuludq %xmm5, %xmm6 656; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 657; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 658; SSSE3-NEXT: pxor %xmm8, %xmm8 659; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 660; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 661; SSSE3-NEXT: pxor %xmm7, %xmm0 662; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 663; SSSE3-NEXT: pmuludq %xmm3, %xmm1 664; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 665; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 666; SSSE3-NEXT: pmuludq %xmm5, %xmm3 667; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] 668; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 669; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 670; SSSE3-NEXT: pxor %xmm7, %xmm2 671; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 672; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] 673; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 674; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 675; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 676; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 677; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) 678; SSSE3-NEXT: movdqa %xmm4, (%rdi) 679; SSSE3-NEXT: movdqa %xmm2, %xmm1 680; SSSE3-NEXT: retq 681; 682; SSE41-LABEL: umulo_v8i32: 683; SSE41: # %bb.0: 684; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 685; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 686; SSE41-NEXT: pmuludq %xmm4, %xmm5 687; SSE41-NEXT: movdqa %xmm0, %xmm4 688; SSE41-NEXT: pmuludq %xmm2, %xmm4 689; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 690; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 691; SSE41-NEXT: pxor %xmm8, %xmm8 692; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 693; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 694; SSE41-NEXT: pxor %xmm7, %xmm4 695; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 696; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] 697; SSE41-NEXT: pmuludq %xmm5, %xmm6 698; SSE41-NEXT: movdqa %xmm1, %xmm5 699; SSE41-NEXT: pmuludq %xmm3, %xmm5 700; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 701; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 702; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 703; SSE41-NEXT: pxor %xmm7, %xmm5 704; SSE41-NEXT: pmulld %xmm2, %xmm0 705; SSE41-NEXT: pmulld %xmm3, %xmm1 706; SSE41-NEXT: movdqa %xmm1, 16(%rdi) 707; SSE41-NEXT: movdqa %xmm0, (%rdi) 708; SSE41-NEXT: movdqa %xmm4, %xmm0 709; SSE41-NEXT: movdqa %xmm5, %xmm1 710; SSE41-NEXT: retq 711; 712; AVX1-LABEL: umulo_v8i32: 713; AVX1: # %bb.0: 714; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 715; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 716; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 717; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 718; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 719; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5 720; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 721; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] 722; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 723; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2 724; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 725; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 726; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 727; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 728; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 729; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm7 730; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 731; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 732; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 733; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 734; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 735; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 736; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 737; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) 738; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 739; AVX1-NEXT: vmovaps %ymm2, %ymm0 740; AVX1-NEXT: retq 741; 742; AVX2-LABEL: umulo_v8i32: 743; AVX2: # %bb.0: 744; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] 745; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] 746; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 747; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm3 748; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] 749; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 750; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 751; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 752; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 753; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 754; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 755; AVX2-NEXT: vmovdqa %ymm0, (%rdi) 756; AVX2-NEXT: vmovdqa %ymm2, %ymm0 757; AVX2-NEXT: retq 758; 759; AVX512-LABEL: umulo_v8i32: 760; AVX512: # %bb.0: 761; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 762; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] 763; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] 764; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 765; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] 766; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 767; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 768; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 769; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 770; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 771; AVX512-NEXT: vmovdqa %ymm1, (%rdi) 772; AVX512-NEXT: retq 773 %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) 774 %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 775 %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 776 %res = sext <8 x i1> %obit to <8 x i32> 777 store <8 x i32> %val, <8 x i32>* %p2 778 ret <8 x i32> %res 779} 780 781define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i32>* %p2) nounwind { 782; SSE2-LABEL: umulo_v16i32: 783; SSE2: # %bb.0: 784; SSE2-NEXT: movdqa %xmm0, %xmm8 785; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] 786; SSE2-NEXT: pmuludq %xmm4, %xmm8 787; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3] 788; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] 789; SSE2-NEXT: pmuludq %xmm10, %xmm9 790; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3] 791; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 792; SSE2-NEXT: pxor %xmm10, %xmm10 793; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 794; SSE2-NEXT: pcmpeqd %xmm11, %xmm11 795; SSE2-NEXT: pxor %xmm11, %xmm0 796; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] 797; SSE2-NEXT: pmuludq %xmm5, %xmm1 798; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3] 799; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] 800; SSE2-NEXT: pmuludq %xmm13, %xmm12 801; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] 802; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] 803; SSE2-NEXT: pcmpeqd %xmm10, %xmm15 804; SSE2-NEXT: pxor %xmm11, %xmm15 805; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] 806; SSE2-NEXT: pmuludq %xmm6, %xmm2 807; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] 808; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] 809; SSE2-NEXT: pmuludq %xmm14, %xmm13 810; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3] 811; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 812; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 813; SSE2-NEXT: pxor %xmm11, %xmm5 814; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] 815; SSE2-NEXT: pmuludq %xmm7, %xmm3 816; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] 817; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 818; SSE2-NEXT: pmuludq %xmm14, %xmm7 819; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3] 820; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] 821; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 822; SSE2-NEXT: pxor %xmm11, %xmm6 823; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 824; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] 825; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 826; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 827; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] 828; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 829; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 830; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] 831; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 832; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 833; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] 834; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 835; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 836; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 837; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 838; SSE2-NEXT: movdqa %xmm8, (%rdi) 839; SSE2-NEXT: movdqa %xmm15, %xmm1 840; SSE2-NEXT: movdqa %xmm5, %xmm2 841; SSE2-NEXT: movdqa %xmm6, %xmm3 842; SSE2-NEXT: retq 843; 844; SSSE3-LABEL: umulo_v16i32: 845; SSSE3: # %bb.0: 846; SSSE3-NEXT: movdqa %xmm0, %xmm8 847; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] 848; SSSE3-NEXT: pmuludq %xmm4, %xmm8 849; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3] 850; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] 851; SSSE3-NEXT: pmuludq %xmm10, %xmm9 852; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3] 853; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 854; SSSE3-NEXT: pxor %xmm10, %xmm10 855; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 856; SSSE3-NEXT: pcmpeqd %xmm11, %xmm11 857; SSSE3-NEXT: pxor %xmm11, %xmm0 858; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] 859; SSSE3-NEXT: pmuludq %xmm5, %xmm1 860; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3] 861; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] 862; SSSE3-NEXT: pmuludq %xmm13, %xmm12 863; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] 864; SSSE3-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] 865; SSSE3-NEXT: pcmpeqd %xmm10, %xmm15 866; SSSE3-NEXT: pxor %xmm11, %xmm15 867; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] 868; SSSE3-NEXT: pmuludq %xmm6, %xmm2 869; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] 870; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] 871; SSSE3-NEXT: pmuludq %xmm14, %xmm13 872; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3] 873; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 874; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 875; SSSE3-NEXT: pxor %xmm11, %xmm5 876; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] 877; SSSE3-NEXT: pmuludq %xmm7, %xmm3 878; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] 879; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 880; SSSE3-NEXT: pmuludq %xmm14, %xmm7 881; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3] 882; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] 883; SSSE3-NEXT: pcmpeqd %xmm10, %xmm6 884; SSSE3-NEXT: pxor %xmm11, %xmm6 885; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 886; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] 887; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 888; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 889; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] 890; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 891; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 892; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] 893; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 894; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 895; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] 896; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 897; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) 898; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) 899; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) 900; SSSE3-NEXT: movdqa %xmm8, (%rdi) 901; SSSE3-NEXT: movdqa %xmm15, %xmm1 902; SSSE3-NEXT: movdqa %xmm5, %xmm2 903; SSSE3-NEXT: movdqa %xmm6, %xmm3 904; SSSE3-NEXT: retq 905; 906; SSE41-LABEL: umulo_v16i32: 907; SSE41: # %bb.0: 908; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] 909; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] 910; SSE41-NEXT: pmuludq %xmm8, %xmm9 911; SSE41-NEXT: movdqa %xmm0, %xmm8 912; SSE41-NEXT: pmuludq %xmm4, %xmm8 913; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] 914; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7] 915; SSE41-NEXT: pxor %xmm12, %xmm12 916; SSE41-NEXT: pcmpeqd %xmm12, %xmm8 917; SSE41-NEXT: pcmpeqd %xmm13, %xmm13 918; SSE41-NEXT: pxor %xmm13, %xmm8 919; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] 920; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] 921; SSE41-NEXT: pmuludq %xmm9, %xmm10 922; SSE41-NEXT: movdqa %xmm1, %xmm9 923; SSE41-NEXT: pmuludq %xmm5, %xmm9 924; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] 925; SSE41-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5],xmm10[6,7] 926; SSE41-NEXT: pcmpeqd %xmm12, %xmm9 927; SSE41-NEXT: pxor %xmm13, %xmm9 928; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] 929; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] 930; SSE41-NEXT: pmuludq %xmm10, %xmm11 931; SSE41-NEXT: movdqa %xmm2, %xmm10 932; SSE41-NEXT: pmuludq %xmm6, %xmm10 933; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] 934; SSE41-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7] 935; SSE41-NEXT: pcmpeqd %xmm12, %xmm10 936; SSE41-NEXT: pxor %xmm13, %xmm10 937; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[1,1,3,3] 938; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] 939; SSE41-NEXT: pmuludq %xmm11, %xmm14 940; SSE41-NEXT: movdqa %xmm3, %xmm11 941; SSE41-NEXT: pmuludq %xmm7, %xmm11 942; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] 943; SSE41-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3],xmm11[4,5],xmm14[6,7] 944; SSE41-NEXT: pcmpeqd %xmm12, %xmm11 945; SSE41-NEXT: pxor %xmm13, %xmm11 946; SSE41-NEXT: pmulld %xmm4, %xmm0 947; SSE41-NEXT: pmulld %xmm5, %xmm1 948; SSE41-NEXT: pmulld %xmm6, %xmm2 949; SSE41-NEXT: pmulld %xmm7, %xmm3 950; SSE41-NEXT: movdqa %xmm3, 48(%rdi) 951; SSE41-NEXT: movdqa %xmm2, 32(%rdi) 952; SSE41-NEXT: movdqa %xmm1, 16(%rdi) 953; SSE41-NEXT: movdqa %xmm0, (%rdi) 954; SSE41-NEXT: movdqa %xmm8, %xmm0 955; SSE41-NEXT: movdqa %xmm9, %xmm1 956; SSE41-NEXT: movdqa %xmm10, %xmm2 957; SSE41-NEXT: movdqa %xmm11, %xmm3 958; SSE41-NEXT: retq 959; 960; AVX1-LABEL: umulo_v16i32: 961; AVX1: # %bb.0: 962; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10 963; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] 964; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 965; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,1,3,3] 966; AVX1-NEXT: vpmuludq %xmm6, %xmm7, %xmm6 967; AVX1-NEXT: vpmuludq %xmm10, %xmm12, %xmm7 968; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 969; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7] 970; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 971; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm7 972; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 973; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm7 974; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] 975; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 976; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4 977; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm6 978; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 979; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5],xmm4[6,7] 980; AVX1-NEXT: vpcmpeqd %xmm4, %xmm8, %xmm4 981; AVX1-NEXT: vpxor %xmm4, %xmm9, %xmm4 982; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm11 983; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 984; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] 985; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 986; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 987; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 988; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm7 989; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 990; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 991; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 992; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm13 993; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 994; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 995; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 996; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm7 997; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 998; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 999; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 1000; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5 1001; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5 1002; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5 1003; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 1004; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4 1005; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 1006; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6 1007; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 1008; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] 1009; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1010; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 1011; AVX1-NEXT: vpacksswb %xmm11, %xmm11, %xmm1 1012; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 1013; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 1014; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1015; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 1016; AVX1-NEXT: vmovdqa %xmm6, 48(%rdi) 1017; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) 1018; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) 1019; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 1020; AVX1-NEXT: retq 1021; 1022; AVX2-LABEL: umulo_v16i32: 1023; AVX2: # %bb.0: 1024; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7] 1025; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7] 1026; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm4 1027; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm5 1028; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,3,3,5,5,7,7] 1029; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] 1030; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 1031; AVX2-NEXT: vpcmpeqd %ymm5, %ymm4, %ymm4 1032; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 1033; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm4 1034; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 1035; AVX2-NEXT: vpackssdw %xmm7, %xmm4, %xmm4 1036; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,3,3,5,5,7,7] 1037; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[1,1,3,3,5,5,7,7] 1038; AVX2-NEXT: vpmuludq %ymm7, %ymm8, %ymm7 1039; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm8 1040; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7] 1041; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] 1042; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 1043; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 1044; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 1045; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 1046; AVX2-NEXT: vpacksswb %xmm5, %xmm5, %xmm5 1047; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2 1048; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3 1049; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0 1050; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm1 1051; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 1052; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) 1053; AVX2-NEXT: vmovdqa %ymm2, (%rdi) 1054; AVX2-NEXT: retq 1055; 1056; AVX512-LABEL: umulo_v16i32: 1057; AVX512: # %bb.0: 1058; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 1059; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 1060; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 1061; AVX512-NEXT: vpmuludq %zmm3, %zmm4, %zmm3 1062; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] 1063; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1064; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k1 1065; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1 1066; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1067; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) 1068; AVX512-NEXT: retq 1069 %t = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) 1070 %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 1071 %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 1072 %res = sext <16 x i1> %obit to <16 x i32> 1073 store <16 x i32> %val, <16 x i32>* %p2 1074 ret <16 x i32> %res 1075} 1076 1077define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nounwind { 1078; SSE2-LABEL: umulo_v16i8: 1079; SSE2: # %bb.0: 1080; SSE2-NEXT: pxor %xmm2, %xmm2 1081; SSE2-NEXT: movdqa %xmm1, %xmm3 1082; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1083; SSE2-NEXT: movdqa %xmm0, %xmm5 1084; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] 1085; SSE2-NEXT: pmullw %xmm3, %xmm5 1086; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1087; SSE2-NEXT: movdqa %xmm5, %xmm3 1088; SSE2-NEXT: pand %xmm4, %xmm3 1089; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1090; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1091; SSE2-NEXT: pmullw %xmm1, %xmm0 1092; SSE2-NEXT: pand %xmm0, %xmm4 1093; SSE2-NEXT: packuswb %xmm3, %xmm4 1094; SSE2-NEXT: psrlw $8, %xmm5 1095; SSE2-NEXT: psrlw $8, %xmm0 1096; SSE2-NEXT: packuswb %xmm5, %xmm0 1097; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 1098; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 1099; SSE2-NEXT: pxor %xmm2, %xmm3 1100; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1101; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1102; SSE2-NEXT: psrad $24, %xmm0 1103; SSE2-NEXT: movdqa %xmm3, %xmm1 1104; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1105; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1106; SSE2-NEXT: pslld $31, %xmm1 1107; SSE2-NEXT: psrad $31, %xmm1 1108; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1109; SSE2-NEXT: movdqa %xmm3, %xmm2 1110; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1111; SSE2-NEXT: pslld $31, %xmm2 1112; SSE2-NEXT: psrad $31, %xmm2 1113; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1114; SSE2-NEXT: pslld $31, %xmm3 1115; SSE2-NEXT: psrad $31, %xmm3 1116; SSE2-NEXT: movdqa %xmm4, (%rdi) 1117; SSE2-NEXT: retq 1118; 1119; SSSE3-LABEL: umulo_v16i8: 1120; SSSE3: # %bb.0: 1121; SSSE3-NEXT: pxor %xmm2, %xmm2 1122; SSSE3-NEXT: movdqa %xmm1, %xmm3 1123; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1124; SSSE3-NEXT: movdqa %xmm0, %xmm5 1125; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] 1126; SSSE3-NEXT: pmullw %xmm3, %xmm5 1127; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1128; SSSE3-NEXT: movdqa %xmm5, %xmm3 1129; SSSE3-NEXT: pand %xmm4, %xmm3 1130; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1131; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1132; SSSE3-NEXT: pmullw %xmm1, %xmm0 1133; SSSE3-NEXT: pand %xmm0, %xmm4 1134; SSSE3-NEXT: packuswb %xmm3, %xmm4 1135; SSSE3-NEXT: psrlw $8, %xmm5 1136; SSSE3-NEXT: psrlw $8, %xmm0 1137; SSSE3-NEXT: packuswb %xmm5, %xmm0 1138; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 1139; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 1140; SSSE3-NEXT: pxor %xmm2, %xmm3 1141; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1142; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1143; SSSE3-NEXT: psrad $24, %xmm0 1144; SSSE3-NEXT: movdqa %xmm3, %xmm1 1145; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1146; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1147; SSSE3-NEXT: pslld $31, %xmm1 1148; SSSE3-NEXT: psrad $31, %xmm1 1149; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1150; SSSE3-NEXT: movdqa %xmm3, %xmm2 1151; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1152; SSSE3-NEXT: pslld $31, %xmm2 1153; SSSE3-NEXT: psrad $31, %xmm2 1154; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1155; SSSE3-NEXT: pslld $31, %xmm3 1156; SSSE3-NEXT: psrad $31, %xmm3 1157; SSSE3-NEXT: movdqa %xmm4, (%rdi) 1158; SSSE3-NEXT: retq 1159; 1160; SSE41-LABEL: umulo_v16i8: 1161; SSE41: # %bb.0: 1162; SSE41-NEXT: pxor %xmm2, %xmm2 1163; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1164; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1165; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1166; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1167; SSE41-NEXT: pmullw %xmm1, %xmm0 1168; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1169; SSE41-NEXT: movdqa %xmm0, %xmm1 1170; SSE41-NEXT: pand %xmm4, %xmm1 1171; SSE41-NEXT: pmullw %xmm3, %xmm5 1172; SSE41-NEXT: pand %xmm5, %xmm4 1173; SSE41-NEXT: packuswb %xmm1, %xmm4 1174; SSE41-NEXT: psrlw $8, %xmm0 1175; SSE41-NEXT: psrlw $8, %xmm5 1176; SSE41-NEXT: packuswb %xmm0, %xmm5 1177; SSE41-NEXT: pcmpeqb %xmm2, %xmm5 1178; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 1179; SSE41-NEXT: pxor %xmm5, %xmm3 1180; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 1181; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] 1182; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1183; SSE41-NEXT: pslld $31, %xmm1 1184; SSE41-NEXT: psrad $31, %xmm1 1185; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 1186; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 1187; SSE41-NEXT: pslld $31, %xmm2 1188; SSE41-NEXT: psrad $31, %xmm2 1189; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] 1190; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1191; SSE41-NEXT: pslld $31, %xmm3 1192; SSE41-NEXT: psrad $31, %xmm3 1193; SSE41-NEXT: movdqa %xmm4, (%rdi) 1194; SSE41-NEXT: retq 1195; 1196; AVX1-LABEL: umulo_v16i8: 1197; AVX1: # %bb.0: 1198; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1199; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1200; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1201; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 1202; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1203; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm5 1204; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1205; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1206; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1207; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 1208; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm4 1209; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1 1210; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1211; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1212; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 1213; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1214; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 1215; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 1216; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 1217; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 1218; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1219; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 1220; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 1221; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 1222; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1223; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1224; AVX1-NEXT: vmovdqa %xmm4, (%rdi) 1225; AVX1-NEXT: retq 1226; 1227; AVX2-LABEL: umulo_v16i8: 1228; AVX2: # %bb.0: 1229; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1230; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1231; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1232; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 1233; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1234; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm2 1235; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1236; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1237; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1238; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1239; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 1240; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1241; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 1242; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 1243; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1244; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 1245; AVX2-NEXT: vmovdqa %xmm2, (%rdi) 1246; AVX2-NEXT: retq 1247; 1248; AVX512F-LABEL: umulo_v16i8: 1249; AVX512F: # %bb.0: 1250; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1251; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1252; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm1 1253; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm0 1254; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1255; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 1256; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1257; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1258; AVX512F-NEXT: vpmovdb %zmm1, (%rdi) 1259; AVX512F-NEXT: retq 1260; 1261; AVX512BW-LABEL: umulo_v16i8: 1262; AVX512BW: # %bb.0: 1263; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1264; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1265; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm1 1266; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm0 1267; AVX512BW-NEXT: vptestmw %ymm0, %ymm0, %k1 1268; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1269; AVX512BW-NEXT: vpmovwb %ymm1, (%rdi) 1270; AVX512BW-NEXT: retq 1271 %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) 1272 %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 1273 %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 1274 %res = sext <16 x i1> %obit to <16 x i32> 1275 store <16 x i8> %val, <16 x i8>* %p2 1276 ret <16 x i32> %res 1277} 1278 1279define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nounwind { 1280; SSE2-LABEL: umulo_v32i8: 1281; SSE2: # %bb.0: 1282; SSE2-NEXT: movq %rdi, %rax 1283; SSE2-NEXT: pxor %xmm5, %xmm5 1284; SSE2-NEXT: movdqa %xmm2, %xmm4 1285; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 1286; SSE2-NEXT: movdqa %xmm0, %xmm6 1287; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 1288; SSE2-NEXT: pmullw %xmm4, %xmm6 1289; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] 1290; SSE2-NEXT: movdqa %xmm6, %xmm7 1291; SSE2-NEXT: pand %xmm11, %xmm7 1292; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 1293; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 1294; SSE2-NEXT: pmullw %xmm2, %xmm0 1295; SSE2-NEXT: movdqa %xmm0, %xmm8 1296; SSE2-NEXT: pand %xmm11, %xmm8 1297; SSE2-NEXT: packuswb %xmm7, %xmm8 1298; SSE2-NEXT: movdqa %xmm3, %xmm7 1299; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 1300; SSE2-NEXT: movdqa %xmm1, %xmm2 1301; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 1302; SSE2-NEXT: pmullw %xmm7, %xmm2 1303; SSE2-NEXT: movdqa %xmm2, %xmm7 1304; SSE2-NEXT: pand %xmm11, %xmm7 1305; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 1306; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 1307; SSE2-NEXT: pmullw %xmm3, %xmm1 1308; SSE2-NEXT: pand %xmm1, %xmm11 1309; SSE2-NEXT: packuswb %xmm7, %xmm11 1310; SSE2-NEXT: psrlw $8, %xmm2 1311; SSE2-NEXT: psrlw $8, %xmm1 1312; SSE2-NEXT: packuswb %xmm2, %xmm1 1313; SSE2-NEXT: pcmpeqb %xmm5, %xmm1 1314; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 1315; SSE2-NEXT: pxor %xmm2, %xmm1 1316; SSE2-NEXT: psrlw $8, %xmm6 1317; SSE2-NEXT: psrlw $8, %xmm0 1318; SSE2-NEXT: packuswb %xmm6, %xmm0 1319; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 1320; SSE2-NEXT: pxor %xmm2, %xmm0 1321; SSE2-NEXT: movdqa %xmm0, %xmm3 1322; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] 1323; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1324; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1325; SSE2-NEXT: pslld $31, %xmm0 1326; SSE2-NEXT: psrad $31, %xmm0 1327; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1328; SSE2-NEXT: movdqa %xmm3, %xmm5 1329; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1330; SSE2-NEXT: pslld $31, %xmm5 1331; SSE2-NEXT: psrad $31, %xmm5 1332; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1333; SSE2-NEXT: pslld $31, %xmm3 1334; SSE2-NEXT: psrad $31, %xmm3 1335; SSE2-NEXT: movdqa %xmm1, %xmm6 1336; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1337; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 1338; SSE2-NEXT: pslld $31, %xmm6 1339; SSE2-NEXT: psrad $31, %xmm6 1340; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 1341; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1342; SSE2-NEXT: movdqa %xmm1, %xmm2 1343; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1344; SSE2-NEXT: pslld $31, %xmm2 1345; SSE2-NEXT: psrad $31, %xmm2 1346; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1347; SSE2-NEXT: pslld $31, %xmm1 1348; SSE2-NEXT: psrad $31, %xmm1 1349; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] 1350; SSE2-NEXT: psrad $24, %xmm7 1351; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] 1352; SSE2-NEXT: psrad $24, %xmm4 1353; SSE2-NEXT: movdqa %xmm11, 16(%rsi) 1354; SSE2-NEXT: movdqa %xmm8, (%rsi) 1355; SSE2-NEXT: movdqa %xmm4, 64(%rdi) 1356; SSE2-NEXT: movdqa %xmm7, (%rdi) 1357; SSE2-NEXT: movdqa %xmm1, 112(%rdi) 1358; SSE2-NEXT: movdqa %xmm2, 96(%rdi) 1359; SSE2-NEXT: movdqa %xmm6, 80(%rdi) 1360; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 1361; SSE2-NEXT: movdqa %xmm5, 32(%rdi) 1362; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 1363; SSE2-NEXT: retq 1364; 1365; SSSE3-LABEL: umulo_v32i8: 1366; SSSE3: # %bb.0: 1367; SSSE3-NEXT: movq %rdi, %rax 1368; SSSE3-NEXT: pxor %xmm5, %xmm5 1369; SSSE3-NEXT: movdqa %xmm2, %xmm4 1370; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 1371; SSSE3-NEXT: movdqa %xmm0, %xmm6 1372; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 1373; SSSE3-NEXT: pmullw %xmm4, %xmm6 1374; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] 1375; SSSE3-NEXT: movdqa %xmm6, %xmm7 1376; SSSE3-NEXT: pand %xmm11, %xmm7 1377; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 1378; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 1379; SSSE3-NEXT: pmullw %xmm2, %xmm0 1380; SSSE3-NEXT: movdqa %xmm0, %xmm8 1381; SSSE3-NEXT: pand %xmm11, %xmm8 1382; SSSE3-NEXT: packuswb %xmm7, %xmm8 1383; SSSE3-NEXT: movdqa %xmm3, %xmm7 1384; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 1385; SSSE3-NEXT: movdqa %xmm1, %xmm2 1386; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 1387; SSSE3-NEXT: pmullw %xmm7, %xmm2 1388; SSSE3-NEXT: movdqa %xmm2, %xmm7 1389; SSSE3-NEXT: pand %xmm11, %xmm7 1390; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 1391; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 1392; SSSE3-NEXT: pmullw %xmm3, %xmm1 1393; SSSE3-NEXT: pand %xmm1, %xmm11 1394; SSSE3-NEXT: packuswb %xmm7, %xmm11 1395; SSSE3-NEXT: psrlw $8, %xmm2 1396; SSSE3-NEXT: psrlw $8, %xmm1 1397; SSSE3-NEXT: packuswb %xmm2, %xmm1 1398; SSSE3-NEXT: pcmpeqb %xmm5, %xmm1 1399; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 1400; SSSE3-NEXT: pxor %xmm2, %xmm1 1401; SSSE3-NEXT: psrlw $8, %xmm6 1402; SSSE3-NEXT: psrlw $8, %xmm0 1403; SSSE3-NEXT: packuswb %xmm6, %xmm0 1404; SSSE3-NEXT: pcmpeqb %xmm5, %xmm0 1405; SSSE3-NEXT: pxor %xmm2, %xmm0 1406; SSSE3-NEXT: movdqa %xmm0, %xmm3 1407; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] 1408; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1409; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1410; SSSE3-NEXT: pslld $31, %xmm0 1411; SSSE3-NEXT: psrad $31, %xmm0 1412; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1413; SSSE3-NEXT: movdqa %xmm3, %xmm5 1414; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1415; SSSE3-NEXT: pslld $31, %xmm5 1416; SSSE3-NEXT: psrad $31, %xmm5 1417; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1418; SSSE3-NEXT: pslld $31, %xmm3 1419; SSSE3-NEXT: psrad $31, %xmm3 1420; SSSE3-NEXT: movdqa %xmm1, %xmm6 1421; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1422; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 1423; SSSE3-NEXT: pslld $31, %xmm6 1424; SSSE3-NEXT: psrad $31, %xmm6 1425; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 1426; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1427; SSSE3-NEXT: movdqa %xmm1, %xmm2 1428; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1429; SSSE3-NEXT: pslld $31, %xmm2 1430; SSSE3-NEXT: psrad $31, %xmm2 1431; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1432; SSSE3-NEXT: pslld $31, %xmm1 1433; SSSE3-NEXT: psrad $31, %xmm1 1434; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] 1435; SSSE3-NEXT: psrad $24, %xmm7 1436; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] 1437; SSSE3-NEXT: psrad $24, %xmm4 1438; SSSE3-NEXT: movdqa %xmm11, 16(%rsi) 1439; SSSE3-NEXT: movdqa %xmm8, (%rsi) 1440; SSSE3-NEXT: movdqa %xmm4, 64(%rdi) 1441; SSSE3-NEXT: movdqa %xmm7, (%rdi) 1442; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) 1443; SSSE3-NEXT: movdqa %xmm2, 96(%rdi) 1444; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) 1445; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) 1446; SSSE3-NEXT: movdqa %xmm5, 32(%rdi) 1447; SSSE3-NEXT: movdqa %xmm0, 16(%rdi) 1448; SSSE3-NEXT: retq 1449; 1450; SSE41-LABEL: umulo_v32i8: 1451; SSE41: # %bb.0: 1452; SSE41-NEXT: movq %rdi, %rax 1453; SSE41-NEXT: pxor %xmm8, %xmm8 1454; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1455; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 1456; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1457; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 1458; SSE41-NEXT: pmullw %xmm2, %xmm0 1459; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] 1460; SSE41-NEXT: movdqa %xmm0, %xmm6 1461; SSE41-NEXT: pand %xmm10, %xmm6 1462; SSE41-NEXT: pmullw %xmm5, %xmm4 1463; SSE41-NEXT: movdqa %xmm4, %xmm9 1464; SSE41-NEXT: pand %xmm10, %xmm9 1465; SSE41-NEXT: packuswb %xmm6, %xmm9 1466; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1467; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] 1468; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1469; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 1470; SSE41-NEXT: pmullw %xmm3, %xmm1 1471; SSE41-NEXT: movdqa %xmm1, %xmm3 1472; SSE41-NEXT: pand %xmm10, %xmm3 1473; SSE41-NEXT: pmullw %xmm7, %xmm6 1474; SSE41-NEXT: pand %xmm6, %xmm10 1475; SSE41-NEXT: packuswb %xmm3, %xmm10 1476; SSE41-NEXT: psrlw $8, %xmm1 1477; SSE41-NEXT: psrlw $8, %xmm6 1478; SSE41-NEXT: packuswb %xmm1, %xmm6 1479; SSE41-NEXT: pcmpeqb %xmm8, %xmm6 1480; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 1481; SSE41-NEXT: pxor %xmm1, %xmm6 1482; SSE41-NEXT: psrlw $8, %xmm0 1483; SSE41-NEXT: psrlw $8, %xmm4 1484; SSE41-NEXT: packuswb %xmm0, %xmm4 1485; SSE41-NEXT: pcmpeqb %xmm8, %xmm4 1486; SSE41-NEXT: pxor %xmm1, %xmm4 1487; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 1488; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1489; SSE41-NEXT: pslld $31, %xmm0 1490; SSE41-NEXT: psrad $31, %xmm0 1491; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] 1492; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1493; SSE41-NEXT: pslld $31, %xmm1 1494; SSE41-NEXT: psrad $31, %xmm1 1495; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] 1496; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1497; SSE41-NEXT: pslld $31, %xmm3 1498; SSE41-NEXT: psrad $31, %xmm3 1499; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1] 1500; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 1501; SSE41-NEXT: pslld $31, %xmm7 1502; SSE41-NEXT: psrad $31, %xmm7 1503; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] 1504; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero 1505; SSE41-NEXT: pslld $31, %xmm5 1506; SSE41-NEXT: psrad $31, %xmm5 1507; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] 1508; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 1509; SSE41-NEXT: pslld $31, %xmm2 1510; SSE41-NEXT: psrad $31, %xmm2 1511; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 1512; SSE41-NEXT: pmovsxbd %xmm6, %xmm6 1513; SSE41-NEXT: movdqa %xmm10, 16(%rsi) 1514; SSE41-NEXT: movdqa %xmm9, (%rsi) 1515; SSE41-NEXT: movdqa %xmm6, 64(%rdi) 1516; SSE41-NEXT: movdqa %xmm4, (%rdi) 1517; SSE41-NEXT: movdqa %xmm2, 112(%rdi) 1518; SSE41-NEXT: movdqa %xmm5, 96(%rdi) 1519; SSE41-NEXT: movdqa %xmm7, 80(%rdi) 1520; SSE41-NEXT: movdqa %xmm3, 48(%rdi) 1521; SSE41-NEXT: movdqa %xmm1, 32(%rdi) 1522; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 1523; SSE41-NEXT: retq 1524; 1525; AVX1-LABEL: umulo_v32i8: 1526; AVX1: # %bb.0: 1527; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1528; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1529; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1530; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 1531; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 1532; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm4 1533; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1534; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1535; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 1536; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm7 1537; AVX1-NEXT: vpackuswb %xmm4, %xmm7, %xmm8 1538; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1539; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1540; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1541; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1542; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 1543; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm7 1544; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1545; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1546; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1547; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm1 1548; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm5 1549; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 1550; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1551; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1552; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 1553; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1554; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm4 1555; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm0 1556; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm3 1557; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0 1558; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 1559; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 1560; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 1561; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 1562; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 1563; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1564; AVX1-NEXT: vpmovsxbd %xmm4, %xmm2 1565; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] 1566; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1567; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1568; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 1569; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1570; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 1571; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1572; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 1573; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] 1574; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1575; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] 1576; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 1577; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 1578; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) 1579; AVX1-NEXT: vmovdqa %xmm8, (%rdi) 1580; AVX1-NEXT: retq 1581; 1582; AVX2-LABEL: umulo_v32i8: 1583; AVX2: # %bb.0: 1584; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1585; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 1586; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1587; AVX2-NEXT: vpmullw %ymm3, %ymm4, %ymm3 1588; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1589; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm5 1590; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 1591; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1592; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1593; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm1 1594; AVX2-NEXT: vpackuswb %ymm5, %ymm1, %ymm4 1595; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm1 1596; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1597; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1598; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 1599; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1600; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1 1601; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 1602; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 1603; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2 1604; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1605; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 1606; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 1607; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 1608; AVX2-NEXT: vmovdqa %ymm4, (%rdi) 1609; AVX2-NEXT: retq 1610; 1611; AVX512F-LABEL: umulo_v32i8: 1612; AVX512F: # %bb.0: 1613; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 1614; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 1615; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 1616; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 1617; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 1618; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm3 1619; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero 1620; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 1621; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1622; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1623; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm3 1624; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm0 1625; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1626; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 1627; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} 1628; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} 1629; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 1630; AVX512F-NEXT: vpmovdb %zmm2, 16(%rdi) 1631; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero 1632; AVX512F-NEXT: vpmovdb %zmm2, (%rdi) 1633; AVX512F-NEXT: retq 1634; 1635; AVX512BW-LABEL: umulo_v32i8: 1636; AVX512BW: # %bb.0: 1637; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 1638; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1639; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm2 1640; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm0 1641; AVX512BW-NEXT: vptestmw %zmm0, %zmm0, %k1 1642; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1643; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 1644; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} 1645; AVX512BW-NEXT: vpmovwb %zmm2, (%rdi) 1646; AVX512BW-NEXT: retq 1647 %t = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> %a0, <32 x i8> %a1) 1648 %val = extractvalue {<32 x i8>, <32 x i1>} %t, 0 1649 %obit = extractvalue {<32 x i8>, <32 x i1>} %t, 1 1650 %res = sext <32 x i1> %obit to <32 x i32> 1651 store <32 x i8> %val, <32 x i8>* %p2 1652 ret <32 x i32> %res 1653} 1654 1655define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nounwind { 1656; SSE2-LABEL: umulo_v64i8: 1657; SSE2: # %bb.0: 1658; SSE2-NEXT: movq %rdi, %rax 1659; SSE2-NEXT: pxor %xmm9, %xmm9 1660; SSE2-NEXT: movdqa %xmm4, %xmm8 1661; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] 1662; SSE2-NEXT: movdqa %xmm0, %xmm10 1663; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 1664; SSE2-NEXT: pmullw %xmm8, %xmm10 1665; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 1666; SSE2-NEXT: movdqa %xmm10, %xmm12 1667; SSE2-NEXT: pand %xmm8, %xmm12 1668; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 1669; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1670; SSE2-NEXT: pmullw %xmm4, %xmm0 1671; SSE2-NEXT: movdqa %xmm0, %xmm11 1672; SSE2-NEXT: pand %xmm8, %xmm11 1673; SSE2-NEXT: packuswb %xmm12, %xmm11 1674; SSE2-NEXT: movdqa %xmm5, %xmm4 1675; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1676; SSE2-NEXT: movdqa %xmm1, %xmm13 1677; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] 1678; SSE2-NEXT: pmullw %xmm4, %xmm13 1679; SSE2-NEXT: movdqa %xmm13, %xmm4 1680; SSE2-NEXT: pand %xmm8, %xmm4 1681; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 1682; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 1683; SSE2-NEXT: pmullw %xmm5, %xmm1 1684; SSE2-NEXT: movdqa %xmm1, %xmm12 1685; SSE2-NEXT: pand %xmm8, %xmm12 1686; SSE2-NEXT: packuswb %xmm4, %xmm12 1687; SSE2-NEXT: movdqa %xmm6, %xmm4 1688; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1689; SSE2-NEXT: movdqa %xmm2, %xmm5 1690; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] 1691; SSE2-NEXT: pmullw %xmm4, %xmm5 1692; SSE2-NEXT: movdqa %xmm5, %xmm4 1693; SSE2-NEXT: pand %xmm8, %xmm4 1694; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] 1695; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 1696; SSE2-NEXT: pmullw %xmm6, %xmm2 1697; SSE2-NEXT: movdqa %xmm2, %xmm14 1698; SSE2-NEXT: pand %xmm8, %xmm14 1699; SSE2-NEXT: packuswb %xmm4, %xmm14 1700; SSE2-NEXT: movdqa %xmm7, %xmm4 1701; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1702; SSE2-NEXT: movdqa %xmm3, %xmm6 1703; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] 1704; SSE2-NEXT: pmullw %xmm4, %xmm6 1705; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 1706; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 1707; SSE2-NEXT: pmullw %xmm7, %xmm3 1708; SSE2-NEXT: movdqa %xmm6, %xmm4 1709; SSE2-NEXT: pand %xmm8, %xmm4 1710; SSE2-NEXT: pand %xmm3, %xmm8 1711; SSE2-NEXT: packuswb %xmm4, %xmm8 1712; SSE2-NEXT: psrlw $8, %xmm6 1713; SSE2-NEXT: psrlw $8, %xmm3 1714; SSE2-NEXT: packuswb %xmm6, %xmm3 1715; SSE2-NEXT: psrlw $8, %xmm5 1716; SSE2-NEXT: psrlw $8, %xmm2 1717; SSE2-NEXT: packuswb %xmm5, %xmm2 1718; SSE2-NEXT: psrlw $8, %xmm13 1719; SSE2-NEXT: psrlw $8, %xmm1 1720; SSE2-NEXT: packuswb %xmm13, %xmm1 1721; SSE2-NEXT: psrlw $8, %xmm10 1722; SSE2-NEXT: psrlw $8, %xmm0 1723; SSE2-NEXT: packuswb %xmm10, %xmm0 1724; SSE2-NEXT: pcmpeqb %xmm9, %xmm3 1725; SSE2-NEXT: pcmpeqb %xmm9, %xmm2 1726; SSE2-NEXT: pcmpeqb %xmm9, %xmm1 1727; SSE2-NEXT: pcmpeqb %xmm9, %xmm0 1728; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1729; SSE2-NEXT: pxor %xmm4, %xmm3 1730; SSE2-NEXT: pxor %xmm4, %xmm2 1731; SSE2-NEXT: pxor %xmm4, %xmm1 1732; SSE2-NEXT: pxor %xmm4, %xmm0 1733; SSE2-NEXT: movdqa %xmm8, 48(%rsi) 1734; SSE2-NEXT: movdqa %xmm14, 32(%rsi) 1735; SSE2-NEXT: movdqa %xmm12, 16(%rsi) 1736; SSE2-NEXT: movdqa %xmm3, %xmm4 1737; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1738; SSE2-NEXT: movdqa %xmm11, (%rsi) 1739; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 1740; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1741; SSE2-NEXT: psrad $24, %xmm5 1742; SSE2-NEXT: movdqa %xmm5, 192(%rdi) 1743; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] 1744; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1745; SSE2-NEXT: psrad $24, %xmm5 1746; SSE2-NEXT: movdqa %xmm5, 128(%rdi) 1747; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 1748; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1749; SSE2-NEXT: psrad $24, %xmm5 1750; SSE2-NEXT: movdqa %xmm5, 64(%rdi) 1751; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 1752; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1753; SSE2-NEXT: psrad $24, %xmm5 1754; SSE2-NEXT: movdqa %xmm5, (%rdi) 1755; SSE2-NEXT: movdqa %xmm4, %xmm5 1756; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1757; SSE2-NEXT: pslld $31, %xmm4 1758; SSE2-NEXT: psrad $31, %xmm4 1759; SSE2-NEXT: movdqa %xmm4, 224(%rdi) 1760; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1761; SSE2-NEXT: pslld $31, %xmm5 1762; SSE2-NEXT: psrad $31, %xmm5 1763; SSE2-NEXT: movdqa %xmm5, 240(%rdi) 1764; SSE2-NEXT: movdqa %xmm2, %xmm4 1765; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1766; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1767; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1768; SSE2-NEXT: pslld $31, %xmm3 1769; SSE2-NEXT: psrad $31, %xmm3 1770; SSE2-NEXT: movdqa %xmm3, 208(%rdi) 1771; SSE2-NEXT: movdqa %xmm4, %xmm3 1772; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1773; SSE2-NEXT: pslld $31, %xmm4 1774; SSE2-NEXT: psrad $31, %xmm4 1775; SSE2-NEXT: movdqa %xmm4, 160(%rdi) 1776; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1777; SSE2-NEXT: pslld $31, %xmm3 1778; SSE2-NEXT: psrad $31, %xmm3 1779; SSE2-NEXT: movdqa %xmm3, 176(%rdi) 1780; SSE2-NEXT: movdqa %xmm1, %xmm3 1781; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1782; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1783; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1784; SSE2-NEXT: pslld $31, %xmm2 1785; SSE2-NEXT: psrad $31, %xmm2 1786; SSE2-NEXT: movdqa %xmm2, 144(%rdi) 1787; SSE2-NEXT: movdqa %xmm3, %xmm2 1788; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1789; SSE2-NEXT: pslld $31, %xmm3 1790; SSE2-NEXT: psrad $31, %xmm3 1791; SSE2-NEXT: movdqa %xmm3, 96(%rdi) 1792; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1793; SSE2-NEXT: pslld $31, %xmm2 1794; SSE2-NEXT: psrad $31, %xmm2 1795; SSE2-NEXT: movdqa %xmm2, 112(%rdi) 1796; SSE2-NEXT: movdqa %xmm0, %xmm2 1797; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1798; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1799; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1800; SSE2-NEXT: pslld $31, %xmm1 1801; SSE2-NEXT: psrad $31, %xmm1 1802; SSE2-NEXT: movdqa %xmm1, 80(%rdi) 1803; SSE2-NEXT: movdqa %xmm2, %xmm1 1804; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1805; SSE2-NEXT: pslld $31, %xmm2 1806; SSE2-NEXT: psrad $31, %xmm2 1807; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 1808; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1809; SSE2-NEXT: pslld $31, %xmm1 1810; SSE2-NEXT: psrad $31, %xmm1 1811; SSE2-NEXT: movdqa %xmm1, 48(%rdi) 1812; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1813; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1814; SSE2-NEXT: pslld $31, %xmm0 1815; SSE2-NEXT: psrad $31, %xmm0 1816; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 1817; SSE2-NEXT: retq 1818; 1819; SSSE3-LABEL: umulo_v64i8: 1820; SSSE3: # %bb.0: 1821; SSSE3-NEXT: movq %rdi, %rax 1822; SSSE3-NEXT: pxor %xmm9, %xmm9 1823; SSSE3-NEXT: movdqa %xmm4, %xmm8 1824; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] 1825; SSSE3-NEXT: movdqa %xmm0, %xmm10 1826; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 1827; SSSE3-NEXT: pmullw %xmm8, %xmm10 1828; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 1829; SSSE3-NEXT: movdqa %xmm10, %xmm12 1830; SSSE3-NEXT: pand %xmm8, %xmm12 1831; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 1832; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1833; SSSE3-NEXT: pmullw %xmm4, %xmm0 1834; SSSE3-NEXT: movdqa %xmm0, %xmm11 1835; SSSE3-NEXT: pand %xmm8, %xmm11 1836; SSSE3-NEXT: packuswb %xmm12, %xmm11 1837; SSSE3-NEXT: movdqa %xmm5, %xmm4 1838; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1839; SSSE3-NEXT: movdqa %xmm1, %xmm13 1840; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] 1841; SSSE3-NEXT: pmullw %xmm4, %xmm13 1842; SSSE3-NEXT: movdqa %xmm13, %xmm4 1843; SSSE3-NEXT: pand %xmm8, %xmm4 1844; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 1845; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 1846; SSSE3-NEXT: pmullw %xmm5, %xmm1 1847; SSSE3-NEXT: movdqa %xmm1, %xmm12 1848; SSSE3-NEXT: pand %xmm8, %xmm12 1849; SSSE3-NEXT: packuswb %xmm4, %xmm12 1850; SSSE3-NEXT: movdqa %xmm6, %xmm4 1851; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1852; SSSE3-NEXT: movdqa %xmm2, %xmm5 1853; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] 1854; SSSE3-NEXT: pmullw %xmm4, %xmm5 1855; SSSE3-NEXT: movdqa %xmm5, %xmm4 1856; SSSE3-NEXT: pand %xmm8, %xmm4 1857; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] 1858; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 1859; SSSE3-NEXT: pmullw %xmm6, %xmm2 1860; SSSE3-NEXT: movdqa %xmm2, %xmm14 1861; SSSE3-NEXT: pand %xmm8, %xmm14 1862; SSSE3-NEXT: packuswb %xmm4, %xmm14 1863; SSSE3-NEXT: movdqa %xmm7, %xmm4 1864; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1865; SSSE3-NEXT: movdqa %xmm3, %xmm6 1866; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] 1867; SSSE3-NEXT: pmullw %xmm4, %xmm6 1868; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 1869; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 1870; SSSE3-NEXT: pmullw %xmm7, %xmm3 1871; SSSE3-NEXT: movdqa %xmm6, %xmm4 1872; SSSE3-NEXT: pand %xmm8, %xmm4 1873; SSSE3-NEXT: pand %xmm3, %xmm8 1874; SSSE3-NEXT: packuswb %xmm4, %xmm8 1875; SSSE3-NEXT: psrlw $8, %xmm6 1876; SSSE3-NEXT: psrlw $8, %xmm3 1877; SSSE3-NEXT: packuswb %xmm6, %xmm3 1878; SSSE3-NEXT: psrlw $8, %xmm5 1879; SSSE3-NEXT: psrlw $8, %xmm2 1880; SSSE3-NEXT: packuswb %xmm5, %xmm2 1881; SSSE3-NEXT: psrlw $8, %xmm13 1882; SSSE3-NEXT: psrlw $8, %xmm1 1883; SSSE3-NEXT: packuswb %xmm13, %xmm1 1884; SSSE3-NEXT: psrlw $8, %xmm10 1885; SSSE3-NEXT: psrlw $8, %xmm0 1886; SSSE3-NEXT: packuswb %xmm10, %xmm0 1887; SSSE3-NEXT: pcmpeqb %xmm9, %xmm3 1888; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2 1889; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1 1890; SSSE3-NEXT: pcmpeqb %xmm9, %xmm0 1891; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 1892; SSSE3-NEXT: pxor %xmm4, %xmm3 1893; SSSE3-NEXT: pxor %xmm4, %xmm2 1894; SSSE3-NEXT: pxor %xmm4, %xmm1 1895; SSSE3-NEXT: pxor %xmm4, %xmm0 1896; SSSE3-NEXT: movdqa %xmm8, 48(%rsi) 1897; SSSE3-NEXT: movdqa %xmm14, 32(%rsi) 1898; SSSE3-NEXT: movdqa %xmm12, 16(%rsi) 1899; SSSE3-NEXT: movdqa %xmm3, %xmm4 1900; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1901; SSSE3-NEXT: movdqa %xmm11, (%rsi) 1902; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 1903; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1904; SSSE3-NEXT: psrad $24, %xmm5 1905; SSSE3-NEXT: movdqa %xmm5, 192(%rdi) 1906; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] 1907; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1908; SSSE3-NEXT: psrad $24, %xmm5 1909; SSSE3-NEXT: movdqa %xmm5, 128(%rdi) 1910; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 1911; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1912; SSSE3-NEXT: psrad $24, %xmm5 1913; SSSE3-NEXT: movdqa %xmm5, 64(%rdi) 1914; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 1915; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1916; SSSE3-NEXT: psrad $24, %xmm5 1917; SSSE3-NEXT: movdqa %xmm5, (%rdi) 1918; SSSE3-NEXT: movdqa %xmm4, %xmm5 1919; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1920; SSSE3-NEXT: pslld $31, %xmm4 1921; SSSE3-NEXT: psrad $31, %xmm4 1922; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) 1923; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1924; SSSE3-NEXT: pslld $31, %xmm5 1925; SSSE3-NEXT: psrad $31, %xmm5 1926; SSSE3-NEXT: movdqa %xmm5, 240(%rdi) 1927; SSSE3-NEXT: movdqa %xmm2, %xmm4 1928; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1929; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1930; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1931; SSSE3-NEXT: pslld $31, %xmm3 1932; SSSE3-NEXT: psrad $31, %xmm3 1933; SSSE3-NEXT: movdqa %xmm3, 208(%rdi) 1934; SSSE3-NEXT: movdqa %xmm4, %xmm3 1935; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1936; SSSE3-NEXT: pslld $31, %xmm4 1937; SSSE3-NEXT: psrad $31, %xmm4 1938; SSSE3-NEXT: movdqa %xmm4, 160(%rdi) 1939; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1940; SSSE3-NEXT: pslld $31, %xmm3 1941; SSSE3-NEXT: psrad $31, %xmm3 1942; SSSE3-NEXT: movdqa %xmm3, 176(%rdi) 1943; SSSE3-NEXT: movdqa %xmm1, %xmm3 1944; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1945; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1946; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1947; SSSE3-NEXT: pslld $31, %xmm2 1948; SSSE3-NEXT: psrad $31, %xmm2 1949; SSSE3-NEXT: movdqa %xmm2, 144(%rdi) 1950; SSSE3-NEXT: movdqa %xmm3, %xmm2 1951; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1952; SSSE3-NEXT: pslld $31, %xmm3 1953; SSSE3-NEXT: psrad $31, %xmm3 1954; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) 1955; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1956; SSSE3-NEXT: pslld $31, %xmm2 1957; SSSE3-NEXT: psrad $31, %xmm2 1958; SSSE3-NEXT: movdqa %xmm2, 112(%rdi) 1959; SSSE3-NEXT: movdqa %xmm0, %xmm2 1960; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1961; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1962; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1963; SSSE3-NEXT: pslld $31, %xmm1 1964; SSSE3-NEXT: psrad $31, %xmm1 1965; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) 1966; SSSE3-NEXT: movdqa %xmm2, %xmm1 1967; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1968; SSSE3-NEXT: pslld $31, %xmm2 1969; SSSE3-NEXT: psrad $31, %xmm2 1970; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) 1971; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1972; SSSE3-NEXT: pslld $31, %xmm1 1973; SSSE3-NEXT: psrad $31, %xmm1 1974; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) 1975; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1976; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1977; SSSE3-NEXT: pslld $31, %xmm0 1978; SSSE3-NEXT: psrad $31, %xmm0 1979; SSSE3-NEXT: movdqa %xmm0, 16(%rdi) 1980; SSSE3-NEXT: retq 1981; 1982; SSE41-LABEL: umulo_v64i8: 1983; SSE41: # %bb.0: 1984; SSE41-NEXT: movq %rdi, %rax 1985; SSE41-NEXT: pxor %xmm13, %xmm13 1986; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 1987; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] 1988; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1989; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] 1990; SSE41-NEXT: pmullw %xmm4, %xmm0 1991; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] 1992; SSE41-NEXT: movdqa %xmm0, %xmm4 1993; SSE41-NEXT: pand %xmm9, %xmm4 1994; SSE41-NEXT: pmullw %xmm10, %xmm8 1995; SSE41-NEXT: movdqa %xmm8, %xmm10 1996; SSE41-NEXT: pand %xmm9, %xmm10 1997; SSE41-NEXT: packuswb %xmm4, %xmm10 1998; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 1999; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] 2000; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2001; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] 2002; SSE41-NEXT: pmullw %xmm5, %xmm1 2003; SSE41-NEXT: movdqa %xmm1, %xmm5 2004; SSE41-NEXT: pand %xmm9, %xmm5 2005; SSE41-NEXT: pmullw %xmm11, %xmm4 2006; SSE41-NEXT: movdqa %xmm4, %xmm11 2007; SSE41-NEXT: pand %xmm9, %xmm11 2008; SSE41-NEXT: packuswb %xmm5, %xmm11 2009; SSE41-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 2010; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] 2011; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2012; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] 2013; SSE41-NEXT: pmullw %xmm6, %xmm2 2014; SSE41-NEXT: movdqa %xmm2, %xmm6 2015; SSE41-NEXT: pand %xmm9, %xmm6 2016; SSE41-NEXT: pmullw %xmm12, %xmm5 2017; SSE41-NEXT: movdqa %xmm5, %xmm12 2018; SSE41-NEXT: pand %xmm9, %xmm12 2019; SSE41-NEXT: packuswb %xmm6, %xmm12 2020; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero 2021; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] 2022; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2023; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] 2024; SSE41-NEXT: pmullw %xmm7, %xmm3 2025; SSE41-NEXT: pmullw %xmm14, %xmm6 2026; SSE41-NEXT: movdqa %xmm3, %xmm7 2027; SSE41-NEXT: pand %xmm9, %xmm7 2028; SSE41-NEXT: pand %xmm6, %xmm9 2029; SSE41-NEXT: packuswb %xmm7, %xmm9 2030; SSE41-NEXT: psrlw $8, %xmm3 2031; SSE41-NEXT: psrlw $8, %xmm6 2032; SSE41-NEXT: packuswb %xmm3, %xmm6 2033; SSE41-NEXT: psrlw $8, %xmm2 2034; SSE41-NEXT: psrlw $8, %xmm5 2035; SSE41-NEXT: packuswb %xmm2, %xmm5 2036; SSE41-NEXT: psrlw $8, %xmm1 2037; SSE41-NEXT: psrlw $8, %xmm4 2038; SSE41-NEXT: packuswb %xmm1, %xmm4 2039; SSE41-NEXT: psrlw $8, %xmm0 2040; SSE41-NEXT: psrlw $8, %xmm8 2041; SSE41-NEXT: packuswb %xmm0, %xmm8 2042; SSE41-NEXT: pcmpeqb %xmm13, %xmm6 2043; SSE41-NEXT: pcmpeqb %xmm13, %xmm5 2044; SSE41-NEXT: pcmpeqb %xmm13, %xmm4 2045; SSE41-NEXT: pcmpeqb %xmm13, %xmm8 2046; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 2047; SSE41-NEXT: pxor %xmm0, %xmm6 2048; SSE41-NEXT: pxor %xmm0, %xmm5 2049; SSE41-NEXT: pxor %xmm0, %xmm4 2050; SSE41-NEXT: pxor %xmm0, %xmm8 2051; SSE41-NEXT: movdqa %xmm9, 48(%rsi) 2052; SSE41-NEXT: movdqa %xmm12, 32(%rsi) 2053; SSE41-NEXT: movdqa %xmm11, 16(%rsi) 2054; SSE41-NEXT: movdqa %xmm10, (%rsi) 2055; SSE41-NEXT: pmovsxbd %xmm6, %xmm0 2056; SSE41-NEXT: movdqa %xmm0, 192(%rdi) 2057; SSE41-NEXT: pmovsxbd %xmm5, %xmm0 2058; SSE41-NEXT: movdqa %xmm0, 128(%rdi) 2059; SSE41-NEXT: pmovsxbd %xmm4, %xmm0 2060; SSE41-NEXT: movdqa %xmm0, 64(%rdi) 2061; SSE41-NEXT: pmovsxbd %xmm8, %xmm0 2062; SSE41-NEXT: movdqa %xmm0, (%rdi) 2063; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] 2064; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2065; SSE41-NEXT: pslld $31, %xmm0 2066; SSE41-NEXT: psrad $31, %xmm0 2067; SSE41-NEXT: movdqa %xmm0, 224(%rdi) 2068; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] 2069; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2070; SSE41-NEXT: pslld $31, %xmm0 2071; SSE41-NEXT: psrad $31, %xmm0 2072; SSE41-NEXT: movdqa %xmm0, 240(%rdi) 2073; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 2074; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2075; SSE41-NEXT: pslld $31, %xmm0 2076; SSE41-NEXT: psrad $31, %xmm0 2077; SSE41-NEXT: movdqa %xmm0, 208(%rdi) 2078; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] 2079; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2080; SSE41-NEXT: pslld $31, %xmm0 2081; SSE41-NEXT: psrad $31, %xmm0 2082; SSE41-NEXT: movdqa %xmm0, 160(%rdi) 2083; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] 2084; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2085; SSE41-NEXT: pslld $31, %xmm0 2086; SSE41-NEXT: psrad $31, %xmm0 2087; SSE41-NEXT: movdqa %xmm0, 176(%rdi) 2088; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 2089; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2090; SSE41-NEXT: pslld $31, %xmm0 2091; SSE41-NEXT: psrad $31, %xmm0 2092; SSE41-NEXT: movdqa %xmm0, 144(%rdi) 2093; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 2094; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2095; SSE41-NEXT: pslld $31, %xmm0 2096; SSE41-NEXT: psrad $31, %xmm0 2097; SSE41-NEXT: movdqa %xmm0, 96(%rdi) 2098; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] 2099; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2100; SSE41-NEXT: pslld $31, %xmm0 2101; SSE41-NEXT: psrad $31, %xmm0 2102; SSE41-NEXT: movdqa %xmm0, 112(%rdi) 2103; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 2104; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2105; SSE41-NEXT: pslld $31, %xmm0 2106; SSE41-NEXT: psrad $31, %xmm0 2107; SSE41-NEXT: movdqa %xmm0, 80(%rdi) 2108; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] 2109; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2110; SSE41-NEXT: pslld $31, %xmm0 2111; SSE41-NEXT: psrad $31, %xmm0 2112; SSE41-NEXT: movdqa %xmm0, 32(%rdi) 2113; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] 2114; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2115; SSE41-NEXT: pslld $31, %xmm0 2116; SSE41-NEXT: psrad $31, %xmm0 2117; SSE41-NEXT: movdqa %xmm0, 48(%rdi) 2118; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 2119; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2120; SSE41-NEXT: pslld $31, %xmm0 2121; SSE41-NEXT: psrad $31, %xmm0 2122; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 2123; SSE41-NEXT: retq 2124; 2125; AVX1-LABEL: umulo_v64i8: 2126; AVX1: # %bb.0: 2127; AVX1-NEXT: movq %rdi, %rax 2128; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 2129; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 2130; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 2131; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm9 2132; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] 2133; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm8 2134; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2135; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2136; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm11 2137; AVX1-NEXT: vpand %xmm6, %xmm11, %xmm4 2138; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm8 2139; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2140; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 2141; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2142; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 2143; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm12 2144; AVX1-NEXT: vpand %xmm6, %xmm12, %xmm7 2145; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2146; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2147; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm13 2148; AVX1-NEXT: vpand %xmm6, %xmm13, %xmm2 2149; AVX1-NEXT: vpackuswb %xmm7, %xmm2, %xmm10 2150; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 2151; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 2152; AVX1-NEXT: vpmullw %xmm2, %xmm7, %xmm7 2153; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 2154; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2155; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2156; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0 2157; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm4 2158; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm14 2159; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2160; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 2161; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2162; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 2163; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 2164; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2165; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2166; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm3 2167; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm1 2168; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm4 2169; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm15 2170; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2171; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 2172; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 2173; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm3 2174; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 2175; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 2176; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm3 2177; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm4 2178; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 2179; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm4 2180; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm6 2181; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4 2182; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2 2183; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0 2184; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 2185; AVX1-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm7 2186; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2187; AVX1-NEXT: vpxor %xmm1, %xmm2, %xmm6 2188; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm4 2189; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm5 2190; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm3 2191; AVX1-NEXT: vmovdqa %xmm15, 48(%rsi) 2192; AVX1-NEXT: vmovdqa %xmm14, 32(%rsi) 2193; AVX1-NEXT: vmovdqa %xmm10, 16(%rsi) 2194; AVX1-NEXT: vmovdqa %xmm8, (%rsi) 2195; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0 2196; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi) 2197; AVX1-NEXT: vpmovsxbd %xmm4, %xmm0 2198; AVX1-NEXT: vmovdqa %xmm0, 128(%rdi) 2199; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 2200; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) 2201; AVX1-NEXT: vpmovsxbd %xmm3, %xmm0 2202; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 2203; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] 2204; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2205; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) 2206; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] 2207; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2208; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi) 2209; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 2210; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2211; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi) 2212; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 2213; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2214; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) 2215; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] 2216; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2217; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi) 2218; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 2219; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2220; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi) 2221; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] 2222; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2223; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi) 2224; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] 2225; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2226; AVX1-NEXT: vmovdqa %xmm0, 112(%rdi) 2227; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 2228; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2229; AVX1-NEXT: vmovdqa %xmm0, 80(%rdi) 2230; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] 2231; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2232; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) 2233; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] 2234; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2235; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi) 2236; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] 2237; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2238; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) 2239; AVX1-NEXT: vzeroupper 2240; AVX1-NEXT: retq 2241; 2242; AVX2-LABEL: umulo_v64i8: 2243; AVX2: # %bb.0: 2244; AVX2-NEXT: movq %rdi, %rax 2245; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 2246; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] 2247; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31] 2248; AVX2-NEXT: vpmullw %ymm5, %ymm6, %ymm5 2249; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 2250; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm7 2251; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] 2252; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23] 2253; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm2 2254; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm0 2255; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm9 2256; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] 2257; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31] 2258; AVX2-NEXT: vpmullw %ymm7, %ymm8, %ymm7 2259; AVX2-NEXT: vpand %ymm6, %ymm7, %ymm8 2260; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] 2261; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] 2262; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 2263; AVX2-NEXT: vpand %ymm6, %ymm1, %ymm3 2264; AVX2-NEXT: vpackuswb %ymm8, %ymm3, %ymm8 2265; AVX2-NEXT: vpsrlw $8, %ymm7, %ymm6 2266; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2267; AVX2-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 2268; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 2269; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 2270; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1 2271; AVX2-NEXT: vpsrlw $8, %ymm5, %ymm5 2272; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 2273; AVX2-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 2274; AVX2-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 2275; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 2276; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] 2277; AVX2-NEXT: vpmovsxbd %xmm4, %ymm4 2278; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 2279; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] 2280; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6 2281; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] 2282; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 2283; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 2284; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 2285; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 2286; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 2287; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5 2288; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 2289; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 2290; AVX2-NEXT: vmovdqa %ymm8, 32(%rsi) 2291; AVX2-NEXT: vmovdqa %ymm9, (%rsi) 2292; AVX2-NEXT: vmovdqa %ymm0, 192(%rdi) 2293; AVX2-NEXT: vmovdqa %ymm1, 128(%rdi) 2294; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi) 2295; AVX2-NEXT: vmovdqa %ymm2, (%rdi) 2296; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi) 2297; AVX2-NEXT: vmovdqa %ymm7, 160(%rdi) 2298; AVX2-NEXT: vmovdqa %ymm6, 96(%rdi) 2299; AVX2-NEXT: vmovdqa %ymm4, 32(%rdi) 2300; AVX2-NEXT: vzeroupper 2301; AVX2-NEXT: retq 2302; 2303; AVX512F-LABEL: umulo_v64i8: 2304; AVX512F: # %bb.0: 2305; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 2306; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 2307; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 2308; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 2309; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 2310; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero 2311; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4 2312; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm3 2313; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero 2314; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 2315; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 2316; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero 2317; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm5 2318; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm2 2319; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 2320; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k2 2321; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 2322; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 2323; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 2324; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 2325; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm6 2326; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm2 2327; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 2328; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k3 2329; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2330; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 2331; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm7 2332; AVX512F-NEXT: vpsrlw $8, %ymm7, %ymm0 2333; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2334; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k4 2335; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} 2336; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} 2337; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} 2338; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} 2339; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero 2340; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi) 2341; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero 2342; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi) 2343; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero 2344; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi) 2345; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero 2346; AVX512F-NEXT: vpmovdb %zmm4, (%rdi) 2347; AVX512F-NEXT: retq 2348; 2349; AVX512BW-LABEL: umulo_v64i8: 2350; AVX512BW: # %bb.0: 2351; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 2352; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] 2353; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63] 2354; AVX512BW-NEXT: vpmullw %zmm3, %zmm4, %zmm3 2355; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 2356; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm5 2357; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] 2358; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55] 2359; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2360; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm1 2361; AVX512BW-NEXT: vpackuswb %zmm5, %zmm1, %zmm4 2362; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm1 2363; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 2364; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 2365; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k1 2366; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 2367; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 2368; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} 2369; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 2370; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} 2371; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 2372; AVX512BW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} 2373; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi) 2374; AVX512BW-NEXT: retq 2375 %t = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> %a0, <64 x i8> %a1) 2376 %val = extractvalue {<64 x i8>, <64 x i1>} %t, 0 2377 %obit = extractvalue {<64 x i8>, <64 x i1>} %t, 1 2378 %res = sext <64 x i1> %obit to <64 x i32> 2379 store <64 x i8> %val, <64 x i8>* %p2 2380 ret <64 x i32> %res 2381} 2382 2383define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) nounwind { 2384; SSE2-LABEL: umulo_v8i16: 2385; SSE2: # %bb.0: 2386; SSE2-NEXT: movdqa %xmm0, %xmm2 2387; SSE2-NEXT: pmullw %xmm1, %xmm2 2388; SSE2-NEXT: pmulhuw %xmm0, %xmm1 2389; SSE2-NEXT: pxor %xmm0, %xmm0 2390; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 2391; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 2392; SSE2-NEXT: pxor %xmm0, %xmm1 2393; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2394; SSE2-NEXT: psrad $16, %xmm0 2395; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2396; SSE2-NEXT: pslld $31, %xmm1 2397; SSE2-NEXT: psrad $31, %xmm1 2398; SSE2-NEXT: movdqa %xmm2, (%rdi) 2399; SSE2-NEXT: retq 2400; 2401; SSSE3-LABEL: umulo_v8i16: 2402; SSSE3: # %bb.0: 2403; SSSE3-NEXT: movdqa %xmm0, %xmm2 2404; SSSE3-NEXT: pmullw %xmm1, %xmm2 2405; SSSE3-NEXT: pmulhuw %xmm0, %xmm1 2406; SSSE3-NEXT: pxor %xmm0, %xmm0 2407; SSSE3-NEXT: pcmpeqw %xmm0, %xmm1 2408; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 2409; SSSE3-NEXT: pxor %xmm0, %xmm1 2410; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2411; SSSE3-NEXT: psrad $16, %xmm0 2412; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2413; SSSE3-NEXT: pslld $31, %xmm1 2414; SSSE3-NEXT: psrad $31, %xmm1 2415; SSSE3-NEXT: movdqa %xmm2, (%rdi) 2416; SSSE3-NEXT: retq 2417; 2418; SSE41-LABEL: umulo_v8i16: 2419; SSE41: # %bb.0: 2420; SSE41-NEXT: movdqa %xmm0, %xmm2 2421; SSE41-NEXT: pmullw %xmm1, %xmm2 2422; SSE41-NEXT: pmulhuw %xmm0, %xmm1 2423; SSE41-NEXT: pxor %xmm0, %xmm0 2424; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 2425; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 2426; SSE41-NEXT: pxor %xmm0, %xmm1 2427; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 2428; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2429; SSE41-NEXT: pslld $31, %xmm1 2430; SSE41-NEXT: psrad $31, %xmm1 2431; SSE41-NEXT: movdqa %xmm2, (%rdi) 2432; SSE41-NEXT: retq 2433; 2434; AVX1-LABEL: umulo_v8i16: 2435; AVX1: # %bb.0: 2436; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2437; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2438; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2439; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 2440; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2441; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2442; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 2443; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 2444; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 2445; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2446; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 2447; AVX1-NEXT: retq 2448; 2449; AVX2-LABEL: umulo_v8i16: 2450; AVX2: # %bb.0: 2451; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2452; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2453; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2454; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 2455; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2456; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 2457; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 2458; AVX2-NEXT: vmovdqa %xmm2, (%rdi) 2459; AVX2-NEXT: retq 2460; 2461; AVX512F-LABEL: umulo_v8i16: 2462; AVX512F: # %bb.0: 2463; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2464; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2465; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2466; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 2467; AVX512F-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 2468; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 2469; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 2470; AVX512F-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2471; AVX512F-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2472; AVX512F-NEXT: vmovdqa %xmm2, (%rdi) 2473; AVX512F-NEXT: retq 2474; 2475; AVX512BW-LABEL: umulo_v8i16: 2476; AVX512BW: # %bb.0: 2477; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2478; AVX512BW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2479; AVX512BW-NEXT: vptestmw %xmm0, %xmm0, %k1 2480; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2481; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2482; AVX512BW-NEXT: vmovdqa %xmm2, (%rdi) 2483; AVX512BW-NEXT: retq 2484 %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) 2485 %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 2486 %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 2487 %res = sext <8 x i1> %obit to <8 x i32> 2488 store <8 x i16> %val, <8 x i16>* %p2 2489 ret <8 x i32> %res 2490} 2491 2492define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) nounwind { 2493; SSE2-LABEL: umulo_v2i64: 2494; SSE2: # %bb.0: 2495; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2496; SSE2-NEXT: movq %xmm2, %r8 2497; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 2498; SSE2-NEXT: movq %xmm2, %r10 2499; SSE2-NEXT: movq %xmm0, %rax 2500; SSE2-NEXT: movq %xmm1, %rdx 2501; SSE2-NEXT: xorl %ecx, %ecx 2502; SSE2-NEXT: mulq %rdx 2503; SSE2-NEXT: movq $-1, %r9 2504; SSE2-NEXT: movl $0, %esi 2505; SSE2-NEXT: cmovoq %r9, %rsi 2506; SSE2-NEXT: movq %rax, %xmm1 2507; SSE2-NEXT: movq %r8, %rax 2508; SSE2-NEXT: mulq %r10 2509; SSE2-NEXT: movq %rax, %xmm0 2510; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2511; SSE2-NEXT: movq %rsi, %xmm0 2512; SSE2-NEXT: cmovoq %r9, %rcx 2513; SSE2-NEXT: movq %rcx, %xmm2 2514; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2515; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2516; SSE2-NEXT: movdqa %xmm1, (%rdi) 2517; SSE2-NEXT: retq 2518; 2519; SSSE3-LABEL: umulo_v2i64: 2520; SSSE3: # %bb.0: 2521; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2522; SSSE3-NEXT: movq %xmm2, %r8 2523; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 2524; SSSE3-NEXT: movq %xmm2, %r10 2525; SSSE3-NEXT: movq %xmm0, %rax 2526; SSSE3-NEXT: movq %xmm1, %rdx 2527; SSSE3-NEXT: xorl %ecx, %ecx 2528; SSSE3-NEXT: mulq %rdx 2529; SSSE3-NEXT: movq $-1, %r9 2530; SSSE3-NEXT: movl $0, %esi 2531; SSSE3-NEXT: cmovoq %r9, %rsi 2532; SSSE3-NEXT: movq %rax, %xmm1 2533; SSSE3-NEXT: movq %r8, %rax 2534; SSSE3-NEXT: mulq %r10 2535; SSSE3-NEXT: movq %rax, %xmm0 2536; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2537; SSSE3-NEXT: movq %rsi, %xmm0 2538; SSSE3-NEXT: cmovoq %r9, %rcx 2539; SSSE3-NEXT: movq %rcx, %xmm2 2540; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2541; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2542; SSSE3-NEXT: movdqa %xmm1, (%rdi) 2543; SSSE3-NEXT: retq 2544; 2545; SSE41-LABEL: umulo_v2i64: 2546; SSE41: # %bb.0: 2547; SSE41-NEXT: movq %xmm0, %r10 2548; SSE41-NEXT: movq %xmm1, %r8 2549; SSE41-NEXT: pextrq $1, %xmm0, %rax 2550; SSE41-NEXT: pextrq $1, %xmm1, %rdx 2551; SSE41-NEXT: xorl %esi, %esi 2552; SSE41-NEXT: mulq %rdx 2553; SSE41-NEXT: movq $-1, %r9 2554; SSE41-NEXT: movl $0, %ecx 2555; SSE41-NEXT: cmovoq %r9, %rcx 2556; SSE41-NEXT: movq %rax, %xmm0 2557; SSE41-NEXT: movq %r10, %rax 2558; SSE41-NEXT: mulq %r8 2559; SSE41-NEXT: movq %rax, %xmm1 2560; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2561; SSE41-NEXT: movq %rcx, %xmm0 2562; SSE41-NEXT: cmovoq %r9, %rsi 2563; SSE41-NEXT: movq %rsi, %xmm2 2564; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] 2565; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2566; SSE41-NEXT: movdqa %xmm1, (%rdi) 2567; SSE41-NEXT: retq 2568; 2569; AVX-LABEL: umulo_v2i64: 2570; AVX: # %bb.0: 2571; AVX-NEXT: vmovq %xmm0, %r10 2572; AVX-NEXT: vmovq %xmm1, %r8 2573; AVX-NEXT: vpextrq $1, %xmm0, %rax 2574; AVX-NEXT: vpextrq $1, %xmm1, %rdx 2575; AVX-NEXT: xorl %esi, %esi 2576; AVX-NEXT: mulq %rdx 2577; AVX-NEXT: movq $-1, %r9 2578; AVX-NEXT: movl $0, %ecx 2579; AVX-NEXT: cmovoq %r9, %rcx 2580; AVX-NEXT: vmovq %rax, %xmm0 2581; AVX-NEXT: movq %r10, %rax 2582; AVX-NEXT: mulq %r8 2583; AVX-NEXT: vmovq %rax, %xmm1 2584; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2585; AVX-NEXT: vmovq %rcx, %xmm0 2586; AVX-NEXT: cmovoq %r9, %rsi 2587; AVX-NEXT: vmovq %rsi, %xmm2 2588; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 2589; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2590; AVX-NEXT: vmovdqa %xmm1, (%rdi) 2591; AVX-NEXT: retq 2592; 2593; AVX512F-LABEL: umulo_v2i64: 2594; AVX512F: # %bb.0: 2595; AVX512F-NEXT: vmovq %xmm0, %rcx 2596; AVX512F-NEXT: vmovq %xmm1, %rsi 2597; AVX512F-NEXT: vpextrq $1, %xmm0, %rax 2598; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx 2599; AVX512F-NEXT: mulq %rdx 2600; AVX512F-NEXT: seto %r8b 2601; AVX512F-NEXT: vmovq %rax, %xmm0 2602; AVX512F-NEXT: movq %rcx, %rax 2603; AVX512F-NEXT: mulq %rsi 2604; AVX512F-NEXT: vmovq %rax, %xmm1 2605; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2606; AVX512F-NEXT: seto %al 2607; AVX512F-NEXT: movw $-3, %cx 2608; AVX512F-NEXT: kmovw %ecx, %k0 2609; AVX512F-NEXT: kmovw %eax, %k1 2610; AVX512F-NEXT: kandw %k0, %k1, %k0 2611; AVX512F-NEXT: kmovw %r8d, %k1 2612; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2613; AVX512F-NEXT: kshiftrw $14, %k1, %k1 2614; AVX512F-NEXT: korw %k1, %k0, %k1 2615; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2616; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 2617; AVX512F-NEXT: vmovdqa %xmm1, (%rdi) 2618; AVX512F-NEXT: retq 2619; 2620; AVX512BW-LABEL: umulo_v2i64: 2621; AVX512BW: # %bb.0: 2622; AVX512BW-NEXT: vmovq %xmm0, %rcx 2623; AVX512BW-NEXT: vmovq %xmm1, %rsi 2624; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax 2625; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdx 2626; AVX512BW-NEXT: mulq %rdx 2627; AVX512BW-NEXT: seto %r8b 2628; AVX512BW-NEXT: vmovq %rax, %xmm0 2629; AVX512BW-NEXT: movq %rcx, %rax 2630; AVX512BW-NEXT: mulq %rsi 2631; AVX512BW-NEXT: vmovq %rax, %xmm1 2632; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2633; AVX512BW-NEXT: seto %al 2634; AVX512BW-NEXT: movw $-3, %cx 2635; AVX512BW-NEXT: kmovd %ecx, %k0 2636; AVX512BW-NEXT: kmovd %eax, %k1 2637; AVX512BW-NEXT: kandw %k0, %k1, %k0 2638; AVX512BW-NEXT: kmovd %r8d, %k1 2639; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 2640; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 2641; AVX512BW-NEXT: korw %k1, %k0, %k1 2642; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2643; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 2644; AVX512BW-NEXT: vmovdqa %xmm1, (%rdi) 2645; AVX512BW-NEXT: retq 2646 %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) 2647 %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 2648 %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 2649 %res = sext <2 x i1> %obit to <2 x i32> 2650 store <2 x i64> %val, <2 x i64>* %p2 2651 ret <2 x i32> %res 2652} 2653 2654define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, <4 x i24>* %p2) nounwind { 2655; SSE2-LABEL: umulo_v4i24: 2656; SSE2: # %bb.0: 2657; SSE2-NEXT: movdqa %xmm0, %xmm2 2658; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 2659; SSE2-NEXT: pand %xmm0, %xmm1 2660; SSE2-NEXT: pand %xmm0, %xmm2 2661; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] 2662; SSE2-NEXT: pmuludq %xmm1, %xmm2 2663; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] 2664; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2665; SSE2-NEXT: pmuludq %xmm0, %xmm1 2666; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 2667; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2668; SSE2-NEXT: pxor %xmm4, %xmm4 2669; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 2670; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 2671; SSE2-NEXT: pxor %xmm3, %xmm5 2672; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2673; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] 2674; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2675; SSE2-NEXT: psrld $24, %xmm0 2676; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 2677; SSE2-NEXT: por %xmm5, %xmm0 2678; SSE2-NEXT: movd %xmm2, %eax 2679; SSE2-NEXT: movw %ax, (%rdi) 2680; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2681; SSE2-NEXT: movd %xmm2, %ecx 2682; SSE2-NEXT: movw %cx, 6(%rdi) 2683; SSE2-NEXT: movd %xmm1, %edx 2684; SSE2-NEXT: movw %dx, 3(%rdi) 2685; SSE2-NEXT: shrl $16, %eax 2686; SSE2-NEXT: movb %al, 2(%rdi) 2687; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 2688; SSE2-NEXT: movd %xmm1, %eax 2689; SSE2-NEXT: movw %ax, 9(%rdi) 2690; SSE2-NEXT: shrl $16, %ecx 2691; SSE2-NEXT: movb %cl, 8(%rdi) 2692; SSE2-NEXT: shrl $16, %edx 2693; SSE2-NEXT: movb %dl, 5(%rdi) 2694; SSE2-NEXT: shrl $16, %eax 2695; SSE2-NEXT: movb %al, 11(%rdi) 2696; SSE2-NEXT: retq 2697; 2698; SSSE3-LABEL: umulo_v4i24: 2699; SSSE3: # %bb.0: 2700; SSSE3-NEXT: movdqa %xmm0, %xmm2 2701; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 2702; SSSE3-NEXT: pand %xmm0, %xmm1 2703; SSSE3-NEXT: pand %xmm0, %xmm2 2704; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] 2705; SSSE3-NEXT: pmuludq %xmm1, %xmm2 2706; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] 2707; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2708; SSSE3-NEXT: pmuludq %xmm0, %xmm1 2709; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 2710; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2711; SSSE3-NEXT: pxor %xmm4, %xmm4 2712; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 2713; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 2714; SSSE3-NEXT: pxor %xmm3, %xmm5 2715; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2716; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] 2717; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2718; SSSE3-NEXT: psrld $24, %xmm0 2719; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 2720; SSSE3-NEXT: por %xmm5, %xmm0 2721; SSSE3-NEXT: movd %xmm2, %eax 2722; SSSE3-NEXT: movw %ax, (%rdi) 2723; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2724; SSSE3-NEXT: movd %xmm2, %ecx 2725; SSSE3-NEXT: movw %cx, 6(%rdi) 2726; SSSE3-NEXT: movd %xmm1, %edx 2727; SSSE3-NEXT: movw %dx, 3(%rdi) 2728; SSSE3-NEXT: shrl $16, %eax 2729; SSSE3-NEXT: movb %al, 2(%rdi) 2730; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 2731; SSSE3-NEXT: movd %xmm1, %eax 2732; SSSE3-NEXT: movw %ax, 9(%rdi) 2733; SSSE3-NEXT: shrl $16, %ecx 2734; SSSE3-NEXT: movb %cl, 8(%rdi) 2735; SSSE3-NEXT: shrl $16, %edx 2736; SSSE3-NEXT: movb %dl, 5(%rdi) 2737; SSSE3-NEXT: shrl $16, %eax 2738; SSSE3-NEXT: movb %al, 11(%rdi) 2739; SSSE3-NEXT: retq 2740; 2741; SSE41-LABEL: umulo_v4i24: 2742; SSE41: # %bb.0: 2743; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 2744; SSE41-NEXT: pand %xmm2, %xmm0 2745; SSE41-NEXT: pand %xmm2, %xmm1 2746; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 2747; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 2748; SSE41-NEXT: pmuludq %xmm2, %xmm3 2749; SSE41-NEXT: movdqa %xmm0, %xmm2 2750; SSE41-NEXT: pmuludq %xmm1, %xmm2 2751; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 2752; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 2753; SSE41-NEXT: pxor %xmm3, %xmm3 2754; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 2755; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 2756; SSE41-NEXT: pxor %xmm2, %xmm4 2757; SSE41-NEXT: pmulld %xmm0, %xmm1 2758; SSE41-NEXT: pextrd $3, %xmm1, %eax 2759; SSE41-NEXT: pextrd $2, %xmm1, %ecx 2760; SSE41-NEXT: pextrd $1, %xmm1, %edx 2761; SSE41-NEXT: movd %xmm1, %esi 2762; SSE41-NEXT: movdqa %xmm1, %xmm0 2763; SSE41-NEXT: psrld $24, %xmm0 2764; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 2765; SSE41-NEXT: por %xmm4, %xmm0 2766; SSE41-NEXT: movw %ax, 9(%rdi) 2767; SSE41-NEXT: movw %cx, 6(%rdi) 2768; SSE41-NEXT: movw %dx, 3(%rdi) 2769; SSE41-NEXT: movw %si, (%rdi) 2770; SSE41-NEXT: shrl $16, %eax 2771; SSE41-NEXT: movb %al, 11(%rdi) 2772; SSE41-NEXT: shrl $16, %ecx 2773; SSE41-NEXT: movb %cl, 8(%rdi) 2774; SSE41-NEXT: shrl $16, %edx 2775; SSE41-NEXT: movb %dl, 5(%rdi) 2776; SSE41-NEXT: shrl $16, %esi 2777; SSE41-NEXT: movb %sil, 2(%rdi) 2778; SSE41-NEXT: retq 2779; 2780; AVX1-LABEL: umulo_v4i24: 2781; AVX1: # %bb.0: 2782; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] 2783; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 2784; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 2785; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,1,3,3] 2786; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,1,3,3] 2787; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 2788; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 2789; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2790; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 2791; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 2792; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 2793; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 2794; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 2795; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1 2796; AVX1-NEXT: vpsrld $24, %xmm1, %xmm0 2797; AVX1-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0 2798; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 2799; AVX1-NEXT: vpextrd $3, %xmm1, %eax 2800; AVX1-NEXT: movw %ax, 9(%rdi) 2801; AVX1-NEXT: vpextrd $2, %xmm1, %ecx 2802; AVX1-NEXT: movw %cx, 6(%rdi) 2803; AVX1-NEXT: vpextrd $1, %xmm1, %edx 2804; AVX1-NEXT: movw %dx, 3(%rdi) 2805; AVX1-NEXT: vmovd %xmm1, %esi 2806; AVX1-NEXT: movw %si, (%rdi) 2807; AVX1-NEXT: shrl $16, %eax 2808; AVX1-NEXT: movb %al, 11(%rdi) 2809; AVX1-NEXT: shrl $16, %ecx 2810; AVX1-NEXT: movb %cl, 8(%rdi) 2811; AVX1-NEXT: shrl $16, %edx 2812; AVX1-NEXT: movb %dl, 5(%rdi) 2813; AVX1-NEXT: shrl $16, %esi 2814; AVX1-NEXT: movb %sil, 2(%rdi) 2815; AVX1-NEXT: retq 2816; 2817; AVX2-LABEL: umulo_v4i24: 2818; AVX2: # %bb.0: 2819; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] 2820; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 2821; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 2822; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 2823; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 2824; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 2825; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 2826; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2827; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 2828; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 2829; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 2830; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 2831; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 2832; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm1 2833; AVX2-NEXT: vpsrld $24, %xmm1, %xmm0 2834; AVX2-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0 2835; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 2836; AVX2-NEXT: vpextrd $3, %xmm1, %eax 2837; AVX2-NEXT: movw %ax, 9(%rdi) 2838; AVX2-NEXT: vpextrd $2, %xmm1, %ecx 2839; AVX2-NEXT: movw %cx, 6(%rdi) 2840; AVX2-NEXT: vpextrd $1, %xmm1, %edx 2841; AVX2-NEXT: movw %dx, 3(%rdi) 2842; AVX2-NEXT: vmovd %xmm1, %esi 2843; AVX2-NEXT: movw %si, (%rdi) 2844; AVX2-NEXT: shrl $16, %eax 2845; AVX2-NEXT: movb %al, 11(%rdi) 2846; AVX2-NEXT: shrl $16, %ecx 2847; AVX2-NEXT: movb %cl, 8(%rdi) 2848; AVX2-NEXT: shrl $16, %edx 2849; AVX2-NEXT: movb %dl, 5(%rdi) 2850; AVX2-NEXT: shrl $16, %esi 2851; AVX2-NEXT: movb %sil, 2(%rdi) 2852; AVX2-NEXT: retq 2853; 2854; AVX512-LABEL: umulo_v4i24: 2855; AVX512: # %bb.0: 2856; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] 2857; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 2858; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 2859; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 2860; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 2861; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 2862; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 2863; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 2864; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 2865; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 2866; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0 2867; AVX512-NEXT: vpor %xmm4, %xmm0, %xmm0 2868; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 2869; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2870; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 2871; AVX512-NEXT: vpextrd $3, %xmm1, %eax 2872; AVX512-NEXT: movw %ax, 9(%rdi) 2873; AVX512-NEXT: vpextrd $2, %xmm1, %ecx 2874; AVX512-NEXT: movw %cx, 6(%rdi) 2875; AVX512-NEXT: vpextrd $1, %xmm1, %edx 2876; AVX512-NEXT: movw %dx, 3(%rdi) 2877; AVX512-NEXT: vmovd %xmm1, %esi 2878; AVX512-NEXT: movw %si, (%rdi) 2879; AVX512-NEXT: shrl $16, %eax 2880; AVX512-NEXT: movb %al, 11(%rdi) 2881; AVX512-NEXT: shrl $16, %ecx 2882; AVX512-NEXT: movb %cl, 8(%rdi) 2883; AVX512-NEXT: shrl $16, %edx 2884; AVX512-NEXT: movb %dl, 5(%rdi) 2885; AVX512-NEXT: shrl $16, %esi 2886; AVX512-NEXT: movb %sil, 2(%rdi) 2887; AVX512-NEXT: retq 2888 %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) 2889 %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 2890 %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 2891 %res = sext <4 x i1> %obit to <4 x i32> 2892 store <4 x i24> %val, <4 x i24>* %p2 2893 ret <4 x i32> %res 2894} 2895 2896define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind { 2897; SSE-LABEL: umulo_v4i1: 2898; SSE: # %bb.0: 2899; SSE-NEXT: pand %xmm1, %xmm0 2900; SSE-NEXT: pslld $31, %xmm0 2901; SSE-NEXT: movmskps %xmm0, %eax 2902; SSE-NEXT: movb %al, (%rdi) 2903; SSE-NEXT: xorps %xmm0, %xmm0 2904; SSE-NEXT: retq 2905; 2906; AVX-LABEL: umulo_v4i1: 2907; AVX: # %bb.0: 2908; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 2909; AVX-NEXT: vpslld $31, %xmm0, %xmm0 2910; AVX-NEXT: vmovmskps %xmm0, %eax 2911; AVX-NEXT: movb %al, (%rdi) 2912; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 2913; AVX-NEXT: retq 2914; 2915; AVX512F-LABEL: umulo_v4i1: 2916; AVX512F: # %bb.0: 2917; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 2918; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 2919; AVX512F-NEXT: vptestmd %xmm0, %xmm0, %k0 2920; AVX512F-NEXT: kmovw %k0, %eax 2921; AVX512F-NEXT: movb %al, (%rdi) 2922; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 2923; AVX512F-NEXT: retq 2924; 2925; AVX512BW-LABEL: umulo_v4i1: 2926; AVX512BW: # %bb.0: 2927; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0 2928; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 2929; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k0 2930; AVX512BW-NEXT: kmovd %k0, %eax 2931; AVX512BW-NEXT: movb %al, (%rdi) 2932; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 2933; AVX512BW-NEXT: retq 2934 %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) 2935 %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 2936 %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 2937 %res = sext <4 x i1> %obit to <4 x i32> 2938 store <4 x i1> %val, <4 x i1>* %p2 2939 ret <4 x i32> %res 2940} 2941 2942define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, <2 x i128>* %p2) nounwind { 2943; SSE2-LABEL: umulo_v2i128: 2944; SSE2: # %bb.0: 2945; SSE2-NEXT: pushq %rbp 2946; SSE2-NEXT: pushq %r15 2947; SSE2-NEXT: pushq %r14 2948; SSE2-NEXT: pushq %r13 2949; SSE2-NEXT: pushq %r12 2950; SSE2-NEXT: pushq %rbx 2951; SSE2-NEXT: movq %r9, %r10 2952; SSE2-NEXT: movq %rcx, %r12 2953; SSE2-NEXT: movq %rdx, %r11 2954; SSE2-NEXT: movq %rsi, %rax 2955; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 2956; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 2957; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 2958; SSE2-NEXT: testq %r10, %r10 2959; SSE2-NEXT: setne %cl 2960; SSE2-NEXT: testq %rsi, %rsi 2961; SSE2-NEXT: setne %r13b 2962; SSE2-NEXT: andb %cl, %r13b 2963; SSE2-NEXT: mulq %r8 2964; SSE2-NEXT: movq %rax, %rsi 2965; SSE2-NEXT: seto %bpl 2966; SSE2-NEXT: movq %r10, %rax 2967; SSE2-NEXT: mulq %rdi 2968; SSE2-NEXT: movq %rax, %rcx 2969; SSE2-NEXT: seto %bl 2970; SSE2-NEXT: orb %bpl, %bl 2971; SSE2-NEXT: addq %rsi, %rcx 2972; SSE2-NEXT: movq %rdi, %rax 2973; SSE2-NEXT: mulq %r8 2974; SSE2-NEXT: movq %rax, %r8 2975; SSE2-NEXT: movq %rdx, %rsi 2976; SSE2-NEXT: addq %rcx, %rsi 2977; SSE2-NEXT: setb %cl 2978; SSE2-NEXT: orb %bl, %cl 2979; SSE2-NEXT: orb %r13b, %cl 2980; SSE2-NEXT: testq %r9, %r9 2981; SSE2-NEXT: setne %al 2982; SSE2-NEXT: testq %r12, %r12 2983; SSE2-NEXT: setne %r10b 2984; SSE2-NEXT: andb %al, %r10b 2985; SSE2-NEXT: movq %r12, %rax 2986; SSE2-NEXT: mulq %r15 2987; SSE2-NEXT: movq %rax, %rdi 2988; SSE2-NEXT: seto %bpl 2989; SSE2-NEXT: movq %r9, %rax 2990; SSE2-NEXT: mulq %r11 2991; SSE2-NEXT: movq %rax, %rbx 2992; SSE2-NEXT: seto %r9b 2993; SSE2-NEXT: orb %bpl, %r9b 2994; SSE2-NEXT: addq %rdi, %rbx 2995; SSE2-NEXT: movq %r11, %rax 2996; SSE2-NEXT: mulq %r15 2997; SSE2-NEXT: addq %rbx, %rdx 2998; SSE2-NEXT: setb %bl 2999; SSE2-NEXT: orb %r9b, %bl 3000; SSE2-NEXT: orb %r10b, %bl 3001; SSE2-NEXT: movzbl %bl, %edi 3002; SSE2-NEXT: negl %edi 3003; SSE2-NEXT: movd %edi, %xmm1 3004; SSE2-NEXT: movzbl %cl, %ecx 3005; SSE2-NEXT: negl %ecx 3006; SSE2-NEXT: movd %ecx, %xmm0 3007; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3008; SSE2-NEXT: movq %rax, 16(%r14) 3009; SSE2-NEXT: movq %r8, (%r14) 3010; SSE2-NEXT: movq %rdx, 24(%r14) 3011; SSE2-NEXT: movq %rsi, 8(%r14) 3012; SSE2-NEXT: popq %rbx 3013; SSE2-NEXT: popq %r12 3014; SSE2-NEXT: popq %r13 3015; SSE2-NEXT: popq %r14 3016; SSE2-NEXT: popq %r15 3017; SSE2-NEXT: popq %rbp 3018; SSE2-NEXT: retq 3019; 3020; SSSE3-LABEL: umulo_v2i128: 3021; SSSE3: # %bb.0: 3022; SSSE3-NEXT: pushq %rbp 3023; SSSE3-NEXT: pushq %r15 3024; SSSE3-NEXT: pushq %r14 3025; SSSE3-NEXT: pushq %r13 3026; SSSE3-NEXT: pushq %r12 3027; SSSE3-NEXT: pushq %rbx 3028; SSSE3-NEXT: movq %r9, %r10 3029; SSSE3-NEXT: movq %rcx, %r12 3030; SSSE3-NEXT: movq %rdx, %r11 3031; SSSE3-NEXT: movq %rsi, %rax 3032; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 3033; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 3034; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 3035; SSSE3-NEXT: testq %r10, %r10 3036; SSSE3-NEXT: setne %cl 3037; SSSE3-NEXT: testq %rsi, %rsi 3038; SSSE3-NEXT: setne %r13b 3039; SSSE3-NEXT: andb %cl, %r13b 3040; SSSE3-NEXT: mulq %r8 3041; SSSE3-NEXT: movq %rax, %rsi 3042; SSSE3-NEXT: seto %bpl 3043; SSSE3-NEXT: movq %r10, %rax 3044; SSSE3-NEXT: mulq %rdi 3045; SSSE3-NEXT: movq %rax, %rcx 3046; SSSE3-NEXT: seto %bl 3047; SSSE3-NEXT: orb %bpl, %bl 3048; SSSE3-NEXT: addq %rsi, %rcx 3049; SSSE3-NEXT: movq %rdi, %rax 3050; SSSE3-NEXT: mulq %r8 3051; SSSE3-NEXT: movq %rax, %r8 3052; SSSE3-NEXT: movq %rdx, %rsi 3053; SSSE3-NEXT: addq %rcx, %rsi 3054; SSSE3-NEXT: setb %cl 3055; SSSE3-NEXT: orb %bl, %cl 3056; SSSE3-NEXT: orb %r13b, %cl 3057; SSSE3-NEXT: testq %r9, %r9 3058; SSSE3-NEXT: setne %al 3059; SSSE3-NEXT: testq %r12, %r12 3060; SSSE3-NEXT: setne %r10b 3061; SSSE3-NEXT: andb %al, %r10b 3062; SSSE3-NEXT: movq %r12, %rax 3063; SSSE3-NEXT: mulq %r15 3064; SSSE3-NEXT: movq %rax, %rdi 3065; SSSE3-NEXT: seto %bpl 3066; SSSE3-NEXT: movq %r9, %rax 3067; SSSE3-NEXT: mulq %r11 3068; SSSE3-NEXT: movq %rax, %rbx 3069; SSSE3-NEXT: seto %r9b 3070; SSSE3-NEXT: orb %bpl, %r9b 3071; SSSE3-NEXT: addq %rdi, %rbx 3072; SSSE3-NEXT: movq %r11, %rax 3073; SSSE3-NEXT: mulq %r15 3074; SSSE3-NEXT: addq %rbx, %rdx 3075; SSSE3-NEXT: setb %bl 3076; SSSE3-NEXT: orb %r9b, %bl 3077; SSSE3-NEXT: orb %r10b, %bl 3078; SSSE3-NEXT: movzbl %bl, %edi 3079; SSSE3-NEXT: negl %edi 3080; SSSE3-NEXT: movd %edi, %xmm1 3081; SSSE3-NEXT: movzbl %cl, %ecx 3082; SSSE3-NEXT: negl %ecx 3083; SSSE3-NEXT: movd %ecx, %xmm0 3084; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3085; SSSE3-NEXT: movq %rax, 16(%r14) 3086; SSSE3-NEXT: movq %r8, (%r14) 3087; SSSE3-NEXT: movq %rdx, 24(%r14) 3088; SSSE3-NEXT: movq %rsi, 8(%r14) 3089; SSSE3-NEXT: popq %rbx 3090; SSSE3-NEXT: popq %r12 3091; SSSE3-NEXT: popq %r13 3092; SSSE3-NEXT: popq %r14 3093; SSSE3-NEXT: popq %r15 3094; SSSE3-NEXT: popq %rbp 3095; SSSE3-NEXT: retq 3096; 3097; SSE41-LABEL: umulo_v2i128: 3098; SSE41: # %bb.0: 3099; SSE41-NEXT: pushq %rbp 3100; SSE41-NEXT: pushq %r15 3101; SSE41-NEXT: pushq %r14 3102; SSE41-NEXT: pushq %r13 3103; SSE41-NEXT: pushq %r12 3104; SSE41-NEXT: pushq %rbx 3105; SSE41-NEXT: movq %r9, %r10 3106; SSE41-NEXT: movq %rcx, %r12 3107; SSE41-NEXT: movq %rdx, %r11 3108; SSE41-NEXT: movq %rsi, %rax 3109; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 3110; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 3111; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 3112; SSE41-NEXT: testq %r10, %r10 3113; SSE41-NEXT: setne %cl 3114; SSE41-NEXT: testq %rsi, %rsi 3115; SSE41-NEXT: setne %r13b 3116; SSE41-NEXT: andb %cl, %r13b 3117; SSE41-NEXT: mulq %r8 3118; SSE41-NEXT: movq %rax, %rsi 3119; SSE41-NEXT: seto %bpl 3120; SSE41-NEXT: movq %r10, %rax 3121; SSE41-NEXT: mulq %rdi 3122; SSE41-NEXT: movq %rax, %rcx 3123; SSE41-NEXT: seto %bl 3124; SSE41-NEXT: orb %bpl, %bl 3125; SSE41-NEXT: addq %rsi, %rcx 3126; SSE41-NEXT: movq %rdi, %rax 3127; SSE41-NEXT: mulq %r8 3128; SSE41-NEXT: movq %rax, %r8 3129; SSE41-NEXT: movq %rdx, %rsi 3130; SSE41-NEXT: addq %rcx, %rsi 3131; SSE41-NEXT: setb %cl 3132; SSE41-NEXT: orb %bl, %cl 3133; SSE41-NEXT: orb %r13b, %cl 3134; SSE41-NEXT: testq %r9, %r9 3135; SSE41-NEXT: setne %al 3136; SSE41-NEXT: testq %r12, %r12 3137; SSE41-NEXT: setne %r10b 3138; SSE41-NEXT: andb %al, %r10b 3139; SSE41-NEXT: movq %r12, %rax 3140; SSE41-NEXT: mulq %r15 3141; SSE41-NEXT: movq %rax, %rdi 3142; SSE41-NEXT: seto %bpl 3143; SSE41-NEXT: movq %r9, %rax 3144; SSE41-NEXT: mulq %r11 3145; SSE41-NEXT: movq %rax, %rbx 3146; SSE41-NEXT: seto %r9b 3147; SSE41-NEXT: orb %bpl, %r9b 3148; SSE41-NEXT: addq %rdi, %rbx 3149; SSE41-NEXT: movq %r11, %rax 3150; SSE41-NEXT: mulq %r15 3151; SSE41-NEXT: addq %rbx, %rdx 3152; SSE41-NEXT: setb %bl 3153; SSE41-NEXT: orb %r9b, %bl 3154; SSE41-NEXT: orb %r10b, %bl 3155; SSE41-NEXT: movzbl %bl, %edi 3156; SSE41-NEXT: negl %edi 3157; SSE41-NEXT: movzbl %cl, %ecx 3158; SSE41-NEXT: negl %ecx 3159; SSE41-NEXT: movd %ecx, %xmm0 3160; SSE41-NEXT: pinsrd $1, %edi, %xmm0 3161; SSE41-NEXT: movq %rax, 16(%r14) 3162; SSE41-NEXT: movq %r8, (%r14) 3163; SSE41-NEXT: movq %rdx, 24(%r14) 3164; SSE41-NEXT: movq %rsi, 8(%r14) 3165; SSE41-NEXT: popq %rbx 3166; SSE41-NEXT: popq %r12 3167; SSE41-NEXT: popq %r13 3168; SSE41-NEXT: popq %r14 3169; SSE41-NEXT: popq %r15 3170; SSE41-NEXT: popq %rbp 3171; SSE41-NEXT: retq 3172; 3173; AVX-LABEL: umulo_v2i128: 3174; AVX: # %bb.0: 3175; AVX-NEXT: pushq %rbp 3176; AVX-NEXT: pushq %r15 3177; AVX-NEXT: pushq %r14 3178; AVX-NEXT: pushq %r13 3179; AVX-NEXT: pushq %r12 3180; AVX-NEXT: pushq %rbx 3181; AVX-NEXT: movq %r9, %r10 3182; AVX-NEXT: movq %rcx, %r12 3183; AVX-NEXT: movq %rdx, %r11 3184; AVX-NEXT: movq %rsi, %rax 3185; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 3186; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15 3187; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 3188; AVX-NEXT: testq %r10, %r10 3189; AVX-NEXT: setne %cl 3190; AVX-NEXT: testq %rsi, %rsi 3191; AVX-NEXT: setne %r13b 3192; AVX-NEXT: andb %cl, %r13b 3193; AVX-NEXT: mulq %r8 3194; AVX-NEXT: movq %rax, %rsi 3195; AVX-NEXT: seto %bpl 3196; AVX-NEXT: movq %r10, %rax 3197; AVX-NEXT: mulq %rdi 3198; AVX-NEXT: movq %rax, %rcx 3199; AVX-NEXT: seto %bl 3200; AVX-NEXT: orb %bpl, %bl 3201; AVX-NEXT: addq %rsi, %rcx 3202; AVX-NEXT: movq %rdi, %rax 3203; AVX-NEXT: mulq %r8 3204; AVX-NEXT: movq %rax, %r8 3205; AVX-NEXT: movq %rdx, %rsi 3206; AVX-NEXT: addq %rcx, %rsi 3207; AVX-NEXT: setb %cl 3208; AVX-NEXT: orb %bl, %cl 3209; AVX-NEXT: orb %r13b, %cl 3210; AVX-NEXT: testq %r9, %r9 3211; AVX-NEXT: setne %al 3212; AVX-NEXT: testq %r12, %r12 3213; AVX-NEXT: setne %r10b 3214; AVX-NEXT: andb %al, %r10b 3215; AVX-NEXT: movq %r12, %rax 3216; AVX-NEXT: mulq %r15 3217; AVX-NEXT: movq %rax, %rdi 3218; AVX-NEXT: seto %bpl 3219; AVX-NEXT: movq %r9, %rax 3220; AVX-NEXT: mulq %r11 3221; AVX-NEXT: movq %rax, %rbx 3222; AVX-NEXT: seto %r9b 3223; AVX-NEXT: orb %bpl, %r9b 3224; AVX-NEXT: addq %rdi, %rbx 3225; AVX-NEXT: movq %r11, %rax 3226; AVX-NEXT: mulq %r15 3227; AVX-NEXT: addq %rbx, %rdx 3228; AVX-NEXT: setb %bl 3229; AVX-NEXT: orb %r9b, %bl 3230; AVX-NEXT: orb %r10b, %bl 3231; AVX-NEXT: movzbl %bl, %edi 3232; AVX-NEXT: negl %edi 3233; AVX-NEXT: movzbl %cl, %ecx 3234; AVX-NEXT: negl %ecx 3235; AVX-NEXT: vmovd %ecx, %xmm0 3236; AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 3237; AVX-NEXT: movq %rax, 16(%r14) 3238; AVX-NEXT: movq %r8, (%r14) 3239; AVX-NEXT: movq %rdx, 24(%r14) 3240; AVX-NEXT: movq %rsi, 8(%r14) 3241; AVX-NEXT: popq %rbx 3242; AVX-NEXT: popq %r12 3243; AVX-NEXT: popq %r13 3244; AVX-NEXT: popq %r14 3245; AVX-NEXT: popq %r15 3246; AVX-NEXT: popq %rbp 3247; AVX-NEXT: retq 3248; 3249; AVX512F-LABEL: umulo_v2i128: 3250; AVX512F: # %bb.0: 3251; AVX512F-NEXT: pushq %rbp 3252; AVX512F-NEXT: pushq %r15 3253; AVX512F-NEXT: pushq %r14 3254; AVX512F-NEXT: pushq %r13 3255; AVX512F-NEXT: pushq %r12 3256; AVX512F-NEXT: pushq %rbx 3257; AVX512F-NEXT: movq %rcx, %rax 3258; AVX512F-NEXT: movq %rdx, %r12 3259; AVX512F-NEXT: movq %rdi, %r11 3260; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 3261; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 3262; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 3263; AVX512F-NEXT: testq %r10, %r10 3264; AVX512F-NEXT: setne %dl 3265; AVX512F-NEXT: testq %rcx, %rcx 3266; AVX512F-NEXT: setne %r13b 3267; AVX512F-NEXT: andb %dl, %r13b 3268; AVX512F-NEXT: mulq %r15 3269; AVX512F-NEXT: movq %rax, %rdi 3270; AVX512F-NEXT: seto %bpl 3271; AVX512F-NEXT: movq %r10, %rax 3272; AVX512F-NEXT: mulq %r12 3273; AVX512F-NEXT: movq %rax, %rbx 3274; AVX512F-NEXT: seto %cl 3275; AVX512F-NEXT: orb %bpl, %cl 3276; AVX512F-NEXT: addq %rdi, %rbx 3277; AVX512F-NEXT: movq %r12, %rax 3278; AVX512F-NEXT: mulq %r15 3279; AVX512F-NEXT: movq %rax, %r10 3280; AVX512F-NEXT: movq %rdx, %r15 3281; AVX512F-NEXT: addq %rbx, %r15 3282; AVX512F-NEXT: setb %al 3283; AVX512F-NEXT: orb %cl, %al 3284; AVX512F-NEXT: orb %r13b, %al 3285; AVX512F-NEXT: kmovw %eax, %k0 3286; AVX512F-NEXT: testq %r9, %r9 3287; AVX512F-NEXT: setne %al 3288; AVX512F-NEXT: testq %rsi, %rsi 3289; AVX512F-NEXT: setne %cl 3290; AVX512F-NEXT: andb %al, %cl 3291; AVX512F-NEXT: movq %rsi, %rax 3292; AVX512F-NEXT: mulq %r8 3293; AVX512F-NEXT: movq %rax, %rsi 3294; AVX512F-NEXT: seto %bpl 3295; AVX512F-NEXT: movq %r9, %rax 3296; AVX512F-NEXT: mulq %r11 3297; AVX512F-NEXT: movq %rax, %rdi 3298; AVX512F-NEXT: seto %bl 3299; AVX512F-NEXT: orb %bpl, %bl 3300; AVX512F-NEXT: addq %rsi, %rdi 3301; AVX512F-NEXT: movq %r11, %rax 3302; AVX512F-NEXT: mulq %r8 3303; AVX512F-NEXT: addq %rdi, %rdx 3304; AVX512F-NEXT: setb %sil 3305; AVX512F-NEXT: orb %bl, %sil 3306; AVX512F-NEXT: orb %cl, %sil 3307; AVX512F-NEXT: andl $1, %esi 3308; AVX512F-NEXT: kmovw %esi, %k1 3309; AVX512F-NEXT: kshiftlw $1, %k0, %k0 3310; AVX512F-NEXT: korw %k0, %k1, %k1 3311; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3312; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 3313; AVX512F-NEXT: movq %r10, 16(%r14) 3314; AVX512F-NEXT: movq %rax, (%r14) 3315; AVX512F-NEXT: movq %r15, 24(%r14) 3316; AVX512F-NEXT: movq %rdx, 8(%r14) 3317; AVX512F-NEXT: popq %rbx 3318; AVX512F-NEXT: popq %r12 3319; AVX512F-NEXT: popq %r13 3320; AVX512F-NEXT: popq %r14 3321; AVX512F-NEXT: popq %r15 3322; AVX512F-NEXT: popq %rbp 3323; AVX512F-NEXT: retq 3324; 3325; AVX512BW-LABEL: umulo_v2i128: 3326; AVX512BW: # %bb.0: 3327; AVX512BW-NEXT: pushq %rbp 3328; AVX512BW-NEXT: pushq %r15 3329; AVX512BW-NEXT: pushq %r14 3330; AVX512BW-NEXT: pushq %r13 3331; AVX512BW-NEXT: pushq %r12 3332; AVX512BW-NEXT: pushq %rbx 3333; AVX512BW-NEXT: movq %rcx, %rax 3334; AVX512BW-NEXT: movq %rdx, %r12 3335; AVX512BW-NEXT: movq %rdi, %r11 3336; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14 3337; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 3338; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 3339; AVX512BW-NEXT: testq %r10, %r10 3340; AVX512BW-NEXT: setne %dl 3341; AVX512BW-NEXT: testq %rcx, %rcx 3342; AVX512BW-NEXT: setne %r13b 3343; AVX512BW-NEXT: andb %dl, %r13b 3344; AVX512BW-NEXT: mulq %r15 3345; AVX512BW-NEXT: movq %rax, %rdi 3346; AVX512BW-NEXT: seto %bpl 3347; AVX512BW-NEXT: movq %r10, %rax 3348; AVX512BW-NEXT: mulq %r12 3349; AVX512BW-NEXT: movq %rax, %rbx 3350; AVX512BW-NEXT: seto %cl 3351; AVX512BW-NEXT: orb %bpl, %cl 3352; AVX512BW-NEXT: addq %rdi, %rbx 3353; AVX512BW-NEXT: movq %r12, %rax 3354; AVX512BW-NEXT: mulq %r15 3355; AVX512BW-NEXT: movq %rax, %r10 3356; AVX512BW-NEXT: movq %rdx, %r15 3357; AVX512BW-NEXT: addq %rbx, %r15 3358; AVX512BW-NEXT: setb %al 3359; AVX512BW-NEXT: orb %cl, %al 3360; AVX512BW-NEXT: orb %r13b, %al 3361; AVX512BW-NEXT: kmovd %eax, %k0 3362; AVX512BW-NEXT: testq %r9, %r9 3363; AVX512BW-NEXT: setne %al 3364; AVX512BW-NEXT: testq %rsi, %rsi 3365; AVX512BW-NEXT: setne %cl 3366; AVX512BW-NEXT: andb %al, %cl 3367; AVX512BW-NEXT: movq %rsi, %rax 3368; AVX512BW-NEXT: mulq %r8 3369; AVX512BW-NEXT: movq %rax, %rsi 3370; AVX512BW-NEXT: seto %bpl 3371; AVX512BW-NEXT: movq %r9, %rax 3372; AVX512BW-NEXT: mulq %r11 3373; AVX512BW-NEXT: movq %rax, %rdi 3374; AVX512BW-NEXT: seto %bl 3375; AVX512BW-NEXT: orb %bpl, %bl 3376; AVX512BW-NEXT: addq %rsi, %rdi 3377; AVX512BW-NEXT: movq %r11, %rax 3378; AVX512BW-NEXT: mulq %r8 3379; AVX512BW-NEXT: addq %rdi, %rdx 3380; AVX512BW-NEXT: setb %sil 3381; AVX512BW-NEXT: orb %bl, %sil 3382; AVX512BW-NEXT: orb %cl, %sil 3383; AVX512BW-NEXT: andl $1, %esi 3384; AVX512BW-NEXT: kmovw %esi, %k1 3385; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 3386; AVX512BW-NEXT: korw %k0, %k1, %k1 3387; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3388; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 3389; AVX512BW-NEXT: movq %r10, 16(%r14) 3390; AVX512BW-NEXT: movq %rax, (%r14) 3391; AVX512BW-NEXT: movq %r15, 24(%r14) 3392; AVX512BW-NEXT: movq %rdx, 8(%r14) 3393; AVX512BW-NEXT: popq %rbx 3394; AVX512BW-NEXT: popq %r12 3395; AVX512BW-NEXT: popq %r13 3396; AVX512BW-NEXT: popq %r14 3397; AVX512BW-NEXT: popq %r15 3398; AVX512BW-NEXT: popq %rbp 3399; AVX512BW-NEXT: retq 3400 %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) 3401 %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 3402 %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 3403 %res = sext <2 x i1> %obit to <2 x i32> 3404 store <2 x i128> %val, <2 x i128>* %p2 3405 ret <2 x i32> %res 3406} 3407