1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F 8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512F 9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW 10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512BW 11 12declare {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32>, <1 x i32>) 13declare {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32>, <2 x i32>) 14declare {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32>, <3 x i32>) 15declare {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32>, <4 x i32>) 16declare {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32>, <6 x i32>) 17declare {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32>, <8 x i32>) 18declare {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32>, <16 x i32>) 19 20declare {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8>, <16 x i8>) 21declare {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8>, <32 x i8>) 22declare {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8>, <64 x i8>) 23declare {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16>, <8 x i16>) 24declare {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64>, <2 x i64>) 25 26declare {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24>, <4 x i24>) 27declare {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1>, <4 x i1>) 28declare {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128>, <2 x i128>) 29 30define <1 x i32> @umulo_v1i32(<1 x i32> %a0, <1 x i32> %a1, ptr %p2) nounwind { 31; CHECK-LABEL: umulo_v1i32: 32; CHECK: # %bb.0: 33; CHECK-NEXT: movq %rdx, %rcx 34; CHECK-NEXT: movl %edi, %eax 35; CHECK-NEXT: xorl %edi, %edi 36; CHECK-NEXT: mull %esi 37; CHECK-NEXT: seto %dil 38; CHECK-NEXT: negl %edi 39; CHECK-NEXT: movl %eax, (%rcx) 40; CHECK-NEXT: movl %edi, %eax 41; CHECK-NEXT: retq 42 %t = call {<1 x i32>, <1 x i1>} @llvm.umul.with.overflow.v1i32(<1 x i32> %a0, <1 x i32> %a1) 43 %val = extractvalue {<1 x i32>, <1 x i1>} %t, 0 44 %obit = extractvalue {<1 x i32>, <1 x i1>} %t, 1 45 %res = sext <1 x i1> %obit to <1 x i32> 46 store <1 x i32> %val, ptr %p2 47 ret <1 x i32> %res 48} 49 50define <2 x i32> @umulo_v2i32(<2 x i32> %a0, <2 x i32> %a1, ptr %p2) nounwind { 51; SSE2-LABEL: umulo_v2i32: 52; SSE2: # %bb.0: 53; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] 54; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] 55; SSE2-NEXT: pmuludq %xmm1, %xmm2 56; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] 57; SSE2-NEXT: pxor %xmm1, %xmm1 58; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 59; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 60; SSE2-NEXT: pxor %xmm1, %xmm0 61; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] 62; SSE2-NEXT: movq %xmm1, (%rdi) 63; SSE2-NEXT: retq 64; 65; SSSE3-LABEL: umulo_v2i32: 66; SSSE3: # %bb.0: 67; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] 68; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] 69; SSSE3-NEXT: pmuludq %xmm1, %xmm2 70; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] 71; SSSE3-NEXT: pxor %xmm1, %xmm1 72; SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 73; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 74; SSSE3-NEXT: pxor %xmm1, %xmm0 75; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] 76; SSSE3-NEXT: movq %xmm1, (%rdi) 77; SSSE3-NEXT: retq 78; 79; SSE41-LABEL: umulo_v2i32: 80; SSE41: # %bb.0: 81; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 82; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero 83; SSE41-NEXT: pmuludq %xmm1, %xmm2 84; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,3,2,3] 85; SSE41-NEXT: pxor %xmm1, %xmm1 86; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 87; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 88; SSE41-NEXT: pxor %xmm1, %xmm0 89; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] 90; SSE41-NEXT: movq %xmm1, (%rdi) 91; SSE41-NEXT: retq 92; 93; AVX-LABEL: umulo_v2i32: 94; AVX: # %bb.0: 95; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 96; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 97; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm1 98; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 99; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 100; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 101; AVX-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 102; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 103; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 104; AVX-NEXT: vmovq %xmm1, (%rdi) 105; AVX-NEXT: retq 106; 107; AVX512-LABEL: umulo_v2i32: 108; AVX512: # %bb.0: 109; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero 110; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero 111; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 112; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] 113; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] 114; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 115; AVX512-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 116; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 117; AVX512-NEXT: vmovq %xmm1, (%rdi) 118; AVX512-NEXT: retq 119 %t = call {<2 x i32>, <2 x i1>} @llvm.umul.with.overflow.v2i32(<2 x i32> %a0, <2 x i32> %a1) 120 %val = extractvalue {<2 x i32>, <2 x i1>} %t, 0 121 %obit = extractvalue {<2 x i32>, <2 x i1>} %t, 1 122 %res = sext <2 x i1> %obit to <2 x i32> 123 store <2 x i32> %val, ptr %p2 124 ret <2 x i32> %res 125} 126 127define <3 x i32> @umulo_v3i32(<3 x i32> %a0, <3 x i32> %a1, ptr %p2) nounwind { 128; SSE2-LABEL: umulo_v3i32: 129; SSE2: # %bb.0: 130; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 131; SSE2-NEXT: pmuludq %xmm1, %xmm0 132; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 133; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 134; SSE2-NEXT: pmuludq %xmm2, %xmm4 135; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 136; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 137; SSE2-NEXT: pxor %xmm2, %xmm2 138; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 139; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 140; SSE2-NEXT: pxor %xmm2, %xmm1 141; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 142; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 143; SSE2-NEXT: movd %xmm2, 8(%rdi) 144; SSE2-NEXT: movq %xmm0, (%rdi) 145; SSE2-NEXT: movdqa %xmm1, %xmm0 146; SSE2-NEXT: retq 147; 148; SSSE3-LABEL: umulo_v3i32: 149; SSSE3: # %bb.0: 150; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 151; SSSE3-NEXT: pmuludq %xmm1, %xmm0 152; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 153; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 154; SSSE3-NEXT: pmuludq %xmm2, %xmm4 155; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 156; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 157; SSSE3-NEXT: pxor %xmm2, %xmm2 158; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 159; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 160; SSSE3-NEXT: pxor %xmm2, %xmm1 161; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 162; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 163; SSSE3-NEXT: movd %xmm2, 8(%rdi) 164; SSSE3-NEXT: movq %xmm0, (%rdi) 165; SSSE3-NEXT: movdqa %xmm1, %xmm0 166; SSSE3-NEXT: retq 167; 168; SSE41-LABEL: umulo_v3i32: 169; SSE41: # %bb.0: 170; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 171; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 172; SSE41-NEXT: pmuludq %xmm2, %xmm3 173; SSE41-NEXT: movdqa %xmm0, %xmm2 174; SSE41-NEXT: pmuludq %xmm1, %xmm2 175; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 176; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 177; SSE41-NEXT: pxor %xmm3, %xmm3 178; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 179; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 180; SSE41-NEXT: pxor %xmm3, %xmm2 181; SSE41-NEXT: pmulld %xmm1, %xmm0 182; SSE41-NEXT: pextrd $2, %xmm0, 8(%rdi) 183; SSE41-NEXT: movq %xmm0, (%rdi) 184; SSE41-NEXT: movdqa %xmm2, %xmm0 185; SSE41-NEXT: retq 186; 187; AVX1-LABEL: umulo_v3i32: 188; AVX1: # %bb.0: 189; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 190; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 191; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 192; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 193; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 194; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 195; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 196; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 197; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 198; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 199; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 200; AVX1-NEXT: vpextrd $2, %xmm0, 8(%rdi) 201; AVX1-NEXT: vmovq %xmm0, (%rdi) 202; AVX1-NEXT: vmovdqa %xmm2, %xmm0 203; AVX1-NEXT: retq 204; 205; AVX2-LABEL: umulo_v3i32: 206; AVX2: # %bb.0: 207; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 208; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 209; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 210; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 211; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 212; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 213; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 214; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 215; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 216; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 217; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 218; AVX2-NEXT: vpextrd $2, %xmm0, 8(%rdi) 219; AVX2-NEXT: vmovq %xmm0, (%rdi) 220; AVX2-NEXT: vmovdqa %xmm2, %xmm0 221; AVX2-NEXT: retq 222; 223; AVX512-LABEL: umulo_v3i32: 224; AVX512: # %bb.0: 225; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 226; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 227; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 228; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 229; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 230; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 231; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 232; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 233; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 234; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 235; AVX512-NEXT: vpextrd $2, %xmm1, 8(%rdi) 236; AVX512-NEXT: vmovq %xmm1, (%rdi) 237; AVX512-NEXT: retq 238 %t = call {<3 x i32>, <3 x i1>} @llvm.umul.with.overflow.v3i32(<3 x i32> %a0, <3 x i32> %a1) 239 %val = extractvalue {<3 x i32>, <3 x i1>} %t, 0 240 %obit = extractvalue {<3 x i32>, <3 x i1>} %t, 1 241 %res = sext <3 x i1> %obit to <3 x i32> 242 store <3 x i32> %val, ptr %p2 243 ret <3 x i32> %res 244} 245 246define <4 x i32> @umulo_v4i32(<4 x i32> %a0, <4 x i32> %a1, ptr %p2) nounwind { 247; SSE2-LABEL: umulo_v4i32: 248; SSE2: # %bb.0: 249; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 250; SSE2-NEXT: pmuludq %xmm1, %xmm0 251; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 252; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 253; SSE2-NEXT: pmuludq %xmm2, %xmm4 254; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 255; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 256; SSE2-NEXT: pxor %xmm2, %xmm2 257; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 258; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 259; SSE2-NEXT: pxor %xmm2, %xmm1 260; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 261; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 262; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 263; SSE2-NEXT: movdqa %xmm0, (%rdi) 264; SSE2-NEXT: movdqa %xmm1, %xmm0 265; SSE2-NEXT: retq 266; 267; SSSE3-LABEL: umulo_v4i32: 268; SSSE3: # %bb.0: 269; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] 270; SSSE3-NEXT: pmuludq %xmm1, %xmm0 271; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3] 272; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 273; SSSE3-NEXT: pmuludq %xmm2, %xmm4 274; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,3,2,3] 275; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 276; SSSE3-NEXT: pxor %xmm2, %xmm2 277; SSSE3-NEXT: pcmpeqd %xmm3, %xmm2 278; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 279; SSSE3-NEXT: pxor %xmm2, %xmm1 280; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 281; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 282; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 283; SSSE3-NEXT: movdqa %xmm0, (%rdi) 284; SSSE3-NEXT: movdqa %xmm1, %xmm0 285; SSSE3-NEXT: retq 286; 287; SSE41-LABEL: umulo_v4i32: 288; SSE41: # %bb.0: 289; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 290; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 291; SSE41-NEXT: pmuludq %xmm2, %xmm3 292; SSE41-NEXT: movdqa %xmm0, %xmm2 293; SSE41-NEXT: pmuludq %xmm1, %xmm2 294; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 295; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 296; SSE41-NEXT: pxor %xmm3, %xmm3 297; SSE41-NEXT: pcmpeqd %xmm2, %xmm3 298; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 299; SSE41-NEXT: pxor %xmm3, %xmm2 300; SSE41-NEXT: pmulld %xmm1, %xmm0 301; SSE41-NEXT: movdqa %xmm0, (%rdi) 302; SSE41-NEXT: movdqa %xmm2, %xmm0 303; SSE41-NEXT: retq 304; 305; AVX1-LABEL: umulo_v4i32: 306; AVX1: # %bb.0: 307; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 308; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 309; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 310; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 311; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 312; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 313; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 314; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 315; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 316; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2 317; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 318; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 319; AVX1-NEXT: vmovdqa %xmm2, %xmm0 320; AVX1-NEXT: retq 321; 322; AVX2-LABEL: umulo_v4i32: 323; AVX2: # %bb.0: 324; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 325; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 326; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 327; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 328; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 329; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 330; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 331; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 332; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 333; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 334; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 335; AVX2-NEXT: vmovdqa %xmm0, (%rdi) 336; AVX2-NEXT: vmovdqa %xmm2, %xmm0 337; AVX2-NEXT: retq 338; 339; AVX512-LABEL: umulo_v4i32: 340; AVX512: # %bb.0: 341; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 342; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 343; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 344; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 345; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 346; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 347; AVX512-NEXT: vptestmd %xmm4, %xmm4, %k1 348; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 349; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 350; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 351; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 352; AVX512-NEXT: retq 353 %t = call {<4 x i32>, <4 x i1>} @llvm.umul.with.overflow.v4i32(<4 x i32> %a0, <4 x i32> %a1) 354 %val = extractvalue {<4 x i32>, <4 x i1>} %t, 0 355 %obit = extractvalue {<4 x i32>, <4 x i1>} %t, 1 356 %res = sext <4 x i1> %obit to <4 x i32> 357 store <4 x i32> %val, ptr %p2 358 ret <4 x i32> %res 359} 360 361define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { 362; SSE2-LABEL: umulo_v6i32: 363; SSE2: # %bb.0: 364; SSE2-NEXT: movq %rdi, %rax 365; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 366; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 367; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 368; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 369; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 370; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 371; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 372; SSE2-NEXT: movd %r8d, %xmm0 373; SSE2-NEXT: movd %ecx, %xmm1 374; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 375; SSE2-NEXT: movd %edx, %xmm0 376; SSE2-NEXT: movd %esi, %xmm3 377; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 378; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 379; SSE2-NEXT: movd %r9d, %xmm1 380; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx 381; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 382; SSE2-NEXT: pmuludq %xmm1, %xmm0 383; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 384; SSE2-NEXT: pmuludq %xmm2, %xmm3 385; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] 386; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 387; SSE2-NEXT: pmuludq %xmm4, %xmm2 388; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] 389; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 390; SSE2-NEXT: pxor %xmm4, %xmm4 391; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 392; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 393; SSE2-NEXT: pxor %xmm5, %xmm1 394; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 395; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 396; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 397; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 398; SSE2-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero 399; SSE2-NEXT: pmuludq %xmm2, %xmm6 400; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 401; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] 402; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 403; SSE2-NEXT: pcmpeqd %xmm4, %xmm7 404; SSE2-NEXT: pxor %xmm5, %xmm7 405; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 406; SSE2-NEXT: movq %xmm0, 16(%rcx) 407; SSE2-NEXT: movdqa %xmm3, (%rcx) 408; SSE2-NEXT: movq %xmm7, 16(%rdi) 409; SSE2-NEXT: movdqa %xmm1, (%rdi) 410; SSE2-NEXT: retq 411; 412; SSSE3-LABEL: umulo_v6i32: 413; SSSE3: # %bb.0: 414; SSSE3-NEXT: movq %rdi, %rax 415; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 416; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 417; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 418; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 419; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 420; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 421; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] 422; SSSE3-NEXT: movd %r8d, %xmm0 423; SSSE3-NEXT: movd %ecx, %xmm1 424; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 425; SSSE3-NEXT: movd %edx, %xmm0 426; SSSE3-NEXT: movd %esi, %xmm3 427; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 428; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0] 429; SSSE3-NEXT: movd %r9d, %xmm1 430; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %rcx 431; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 432; SSSE3-NEXT: pmuludq %xmm1, %xmm0 433; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] 434; SSSE3-NEXT: pmuludq %xmm2, %xmm3 435; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] 436; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 437; SSSE3-NEXT: pmuludq %xmm4, %xmm2 438; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] 439; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 440; SSSE3-NEXT: pxor %xmm4, %xmm4 441; SSSE3-NEXT: pcmpeqd %xmm4, %xmm1 442; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 443; SSSE3-NEXT: pxor %xmm5, %xmm1 444; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 445; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 446; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 447; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero 448; SSSE3-NEXT: movd {{.*#+}} xmm6 = mem[0],zero,zero,zero 449; SSSE3-NEXT: pmuludq %xmm2, %xmm6 450; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 451; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,3,2,3] 452; SSSE3-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 453; SSSE3-NEXT: pcmpeqd %xmm4, %xmm7 454; SSSE3-NEXT: pxor %xmm5, %xmm7 455; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] 456; SSSE3-NEXT: movq %xmm0, 16(%rcx) 457; SSSE3-NEXT: movdqa %xmm3, (%rcx) 458; SSSE3-NEXT: movq %xmm7, 16(%rdi) 459; SSSE3-NEXT: movdqa %xmm1, (%rdi) 460; SSSE3-NEXT: retq 461; 462; SSE41-LABEL: umulo_v6i32: 463; SSE41: # %bb.0: 464; SSE41-NEXT: movq %rdi, %rax 465; SSE41-NEXT: movd %esi, %xmm2 466; SSE41-NEXT: pinsrd $1, %edx, %xmm2 467; SSE41-NEXT: pinsrd $2, %ecx, %xmm2 468; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 469; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm1 470; SSE41-NEXT: pinsrd $2, {{[0-9]+}}(%rsp), %xmm1 471; SSE41-NEXT: movdqa %xmm1, %xmm0 472; SSE41-NEXT: pmuludq %xmm2, %xmm1 473; SSE41-NEXT: pinsrd $3, %r8d, %xmm2 474; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero 475; SSE41-NEXT: movd %r9d, %xmm4 476; SSE41-NEXT: movdqa %xmm4, %xmm5 477; SSE41-NEXT: pmuludq %xmm3, %xmm4 478; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm3 479; SSE41-NEXT: pinsrd $1, {{[0-9]+}}(%rsp), %xmm5 480; SSE41-NEXT: pmulld %xmm3, %xmm5 481; SSE41-NEXT: pinsrd $3, {{[0-9]+}}(%rsp), %xmm0 482; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %rcx 483; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] 484; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] 485; SSE41-NEXT: pmuludq %xmm3, %xmm6 486; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 487; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5],xmm6[6,7] 488; SSE41-NEXT: pxor %xmm8, %xmm8 489; SSE41-NEXT: pcmpeqd %xmm8, %xmm1 490; SSE41-NEXT: pcmpeqd %xmm6, %xmm6 491; SSE41-NEXT: pxor %xmm6, %xmm1 492; SSE41-NEXT: movd {{.*#+}} xmm7 = mem[0],zero,zero,zero 493; SSE41-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero 494; SSE41-NEXT: pmuludq %xmm7, %xmm3 495; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 496; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7] 497; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 498; SSE41-NEXT: pxor %xmm6, %xmm4 499; SSE41-NEXT: pmulld %xmm2, %xmm0 500; SSE41-NEXT: movq %xmm5, 16(%rcx) 501; SSE41-NEXT: movdqa %xmm0, (%rcx) 502; SSE41-NEXT: movq %xmm4, 16(%rdi) 503; SSE41-NEXT: movdqa %xmm1, (%rdi) 504; SSE41-NEXT: retq 505; 506; AVX1-LABEL: umulo_v6i32: 507; AVX1: # %bb.0: 508; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 509; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 510; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 511; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 512; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 513; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5 514; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 515; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] 516; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 517; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2 518; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 519; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 520; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 521; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 522; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 523; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm7 524; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 525; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 526; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 527; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 528; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 529; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 530; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 531; AVX1-NEXT: vmovq %xmm1, 16(%rdi) 532; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 533; AVX1-NEXT: vmovaps %ymm2, %ymm0 534; AVX1-NEXT: retq 535; 536; AVX2-LABEL: umulo_v6i32: 537; AVX2: # %bb.0: 538; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] 539; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] 540; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 541; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm3 542; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] 543; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 544; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 545; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 546; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 547; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 548; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 549; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 550; AVX2-NEXT: vmovq %xmm1, 16(%rdi) 551; AVX2-NEXT: vmovdqa %xmm0, (%rdi) 552; AVX2-NEXT: vmovdqa %ymm2, %ymm0 553; AVX2-NEXT: retq 554; 555; AVX512-LABEL: umulo_v6i32: 556; AVX512: # %bb.0: 557; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 558; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] 559; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] 560; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 561; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] 562; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 563; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 564; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 565; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 566; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 567; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 568; AVX512-NEXT: vmovq %xmm2, 16(%rdi) 569; AVX512-NEXT: vmovdqa %xmm1, (%rdi) 570; AVX512-NEXT: retq 571 %t = call {<6 x i32>, <6 x i1>} @llvm.umul.with.overflow.v6i32(<6 x i32> %a0, <6 x i32> %a1) 572 %val = extractvalue {<6 x i32>, <6 x i1>} %t, 0 573 %obit = extractvalue {<6 x i32>, <6 x i1>} %t, 1 574 %res = sext <6 x i1> %obit to <6 x i32> 575 store <6 x i32> %val, ptr %p2 576 ret <6 x i32> %res 577} 578 579define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind { 580; SSE2-LABEL: umulo_v8i32: 581; SSE2: # %bb.0: 582; SSE2-NEXT: movdqa %xmm0, %xmm4 583; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 584; SSE2-NEXT: pmuludq %xmm2, %xmm4 585; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] 586; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] 587; SSE2-NEXT: pmuludq %xmm5, %xmm6 588; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 589; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 590; SSE2-NEXT: pxor %xmm8, %xmm8 591; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 592; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 593; SSE2-NEXT: pxor %xmm7, %xmm0 594; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 595; SSE2-NEXT: pmuludq %xmm3, %xmm1 596; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 597; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 598; SSE2-NEXT: pmuludq %xmm5, %xmm3 599; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] 600; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 601; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 602; SSE2-NEXT: pxor %xmm7, %xmm2 603; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 604; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] 605; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 606; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 607; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 608; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 609; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 610; SSE2-NEXT: movdqa %xmm4, (%rdi) 611; SSE2-NEXT: movdqa %xmm2, %xmm1 612; SSE2-NEXT: retq 613; 614; SSSE3-LABEL: umulo_v8i32: 615; SSSE3: # %bb.0: 616; SSSE3-NEXT: movdqa %xmm0, %xmm4 617; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 618; SSSE3-NEXT: pmuludq %xmm2, %xmm4 619; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,3,2,3] 620; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] 621; SSSE3-NEXT: pmuludq %xmm5, %xmm6 622; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,3,2,3] 623; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 624; SSSE3-NEXT: pxor %xmm8, %xmm8 625; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 626; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 627; SSSE3-NEXT: pxor %xmm7, %xmm0 628; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] 629; SSSE3-NEXT: pmuludq %xmm3, %xmm1 630; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3] 631; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 632; SSSE3-NEXT: pmuludq %xmm5, %xmm3 633; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3] 634; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 635; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 636; SSSE3-NEXT: pxor %xmm7, %xmm2 637; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 638; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] 639; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] 640; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 641; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 642; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 643; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) 644; SSSE3-NEXT: movdqa %xmm4, (%rdi) 645; SSSE3-NEXT: movdqa %xmm2, %xmm1 646; SSSE3-NEXT: retq 647; 648; SSE41-LABEL: umulo_v8i32: 649; SSE41: # %bb.0: 650; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] 651; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 652; SSE41-NEXT: pmuludq %xmm4, %xmm5 653; SSE41-NEXT: movdqa %xmm0, %xmm4 654; SSE41-NEXT: pmuludq %xmm2, %xmm4 655; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] 656; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5],xmm5[6,7] 657; SSE41-NEXT: pxor %xmm8, %xmm8 658; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 659; SSE41-NEXT: pcmpeqd %xmm7, %xmm7 660; SSE41-NEXT: pxor %xmm7, %xmm4 661; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] 662; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] 663; SSE41-NEXT: pmuludq %xmm5, %xmm6 664; SSE41-NEXT: movdqa %xmm1, %xmm5 665; SSE41-NEXT: pmuludq %xmm3, %xmm5 666; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 667; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 668; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 669; SSE41-NEXT: pxor %xmm7, %xmm5 670; SSE41-NEXT: pmulld %xmm2, %xmm0 671; SSE41-NEXT: pmulld %xmm3, %xmm1 672; SSE41-NEXT: movdqa %xmm1, 16(%rdi) 673; SSE41-NEXT: movdqa %xmm0, (%rdi) 674; SSE41-NEXT: movdqa %xmm4, %xmm0 675; SSE41-NEXT: movdqa %xmm5, %xmm1 676; SSE41-NEXT: retq 677; 678; AVX1-LABEL: umulo_v8i32: 679; AVX1: # %bb.0: 680; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 681; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] 682; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 683; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 684; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 685; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5 686; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] 687; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7] 688; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 689; AVX1-NEXT: vpcmpeqd %xmm2, %xmm8, %xmm2 690; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 691; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2 692; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] 693; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 694; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 695; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm7 696; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 697; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 698; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 699; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5 700; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 701; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 702; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1 703; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi) 704; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 705; AVX1-NEXT: vmovaps %ymm2, %ymm0 706; AVX1-NEXT: retq 707; 708; AVX2-LABEL: umulo_v8i32: 709; AVX2: # %bb.0: 710; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[1,1,3,3,5,5,7,7] 711; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7] 712; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2 713; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm3 714; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,1,3,3,5,5,7,7] 715; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] 716; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 717; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 718; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 719; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 720; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 721; AVX2-NEXT: vmovdqa %ymm0, (%rdi) 722; AVX2-NEXT: vmovdqa %ymm2, %ymm0 723; AVX2-NEXT: retq 724; 725; AVX512-LABEL: umulo_v8i32: 726; AVX512: # %bb.0: 727; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 728; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] 729; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[1,1,3,3,5,5,7,7] 730; AVX512-NEXT: vpmuludq %ymm3, %ymm4, %ymm3 731; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [1,9,3,11,5,13,7,15] 732; AVX512-NEXT: vpermi2d %ymm3, %ymm2, %ymm4 733; AVX512-NEXT: vptestmd %ymm4, %ymm4, %k1 734; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm1 735; AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 736; AVX512-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 737; AVX512-NEXT: vmovdqa %ymm1, (%rdi) 738; AVX512-NEXT: retq 739 %t = call {<8 x i32>, <8 x i1>} @llvm.umul.with.overflow.v8i32(<8 x i32> %a0, <8 x i32> %a1) 740 %val = extractvalue {<8 x i32>, <8 x i1>} %t, 0 741 %obit = extractvalue {<8 x i32>, <8 x i1>} %t, 1 742 %res = sext <8 x i1> %obit to <8 x i32> 743 store <8 x i32> %val, ptr %p2 744 ret <8 x i32> %res 745} 746 747define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwind { 748; SSE2-LABEL: umulo_v16i32: 749; SSE2: # %bb.0: 750; SSE2-NEXT: movdqa %xmm0, %xmm8 751; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] 752; SSE2-NEXT: pmuludq %xmm4, %xmm8 753; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3] 754; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] 755; SSE2-NEXT: pmuludq %xmm10, %xmm9 756; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3] 757; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 758; SSE2-NEXT: pxor %xmm10, %xmm10 759; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 760; SSE2-NEXT: pcmpeqd %xmm11, %xmm11 761; SSE2-NEXT: pxor %xmm11, %xmm0 762; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] 763; SSE2-NEXT: pmuludq %xmm5, %xmm1 764; SSE2-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3] 765; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] 766; SSE2-NEXT: pmuludq %xmm13, %xmm12 767; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] 768; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] 769; SSE2-NEXT: pcmpeqd %xmm10, %xmm15 770; SSE2-NEXT: pxor %xmm11, %xmm15 771; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] 772; SSE2-NEXT: pmuludq %xmm6, %xmm2 773; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] 774; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] 775; SSE2-NEXT: pmuludq %xmm14, %xmm13 776; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3] 777; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 778; SSE2-NEXT: pcmpeqd %xmm10, %xmm5 779; SSE2-NEXT: pxor %xmm11, %xmm5 780; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] 781; SSE2-NEXT: pmuludq %xmm7, %xmm3 782; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] 783; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 784; SSE2-NEXT: pmuludq %xmm14, %xmm7 785; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3] 786; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] 787; SSE2-NEXT: pcmpeqd %xmm10, %xmm6 788; SSE2-NEXT: pxor %xmm11, %xmm6 789; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 790; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] 791; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 792; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 793; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] 794; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 795; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 796; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] 797; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 798; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 799; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] 800; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 801; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 802; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 803; SSE2-NEXT: movdqa %xmm1, 16(%rdi) 804; SSE2-NEXT: movdqa %xmm8, (%rdi) 805; SSE2-NEXT: movdqa %xmm15, %xmm1 806; SSE2-NEXT: movdqa %xmm5, %xmm2 807; SSE2-NEXT: movdqa %xmm6, %xmm3 808; SSE2-NEXT: retq 809; 810; SSSE3-LABEL: umulo_v16i32: 811; SSSE3: # %bb.0: 812; SSSE3-NEXT: movdqa %xmm0, %xmm8 813; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] 814; SSSE3-NEXT: pmuludq %xmm4, %xmm8 815; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,3,2,3] 816; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm4[1,1,3,3] 817; SSSE3-NEXT: pmuludq %xmm10, %xmm9 818; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,3,2,3] 819; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 820; SSSE3-NEXT: pxor %xmm10, %xmm10 821; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 822; SSSE3-NEXT: pcmpeqd %xmm11, %xmm11 823; SSSE3-NEXT: pxor %xmm11, %xmm0 824; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] 825; SSSE3-NEXT: pmuludq %xmm5, %xmm1 826; SSSE3-NEXT: pshufd {{.*#+}} xmm15 = xmm1[1,3,2,3] 827; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,3,3] 828; SSSE3-NEXT: pmuludq %xmm13, %xmm12 829; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm12[1,3,2,3] 830; SSSE3-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1] 831; SSSE3-NEXT: pcmpeqd %xmm10, %xmm15 832; SSSE3-NEXT: pxor %xmm11, %xmm15 833; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm2[1,1,3,3] 834; SSSE3-NEXT: pmuludq %xmm6, %xmm2 835; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,3,2,3] 836; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm6[1,1,3,3] 837; SSSE3-NEXT: pmuludq %xmm14, %xmm13 838; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm13[1,3,2,3] 839; SSSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 840; SSSE3-NEXT: pcmpeqd %xmm10, %xmm5 841; SSSE3-NEXT: pxor %xmm11, %xmm5 842; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] 843; SSSE3-NEXT: pmuludq %xmm7, %xmm3 844; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[1,3,2,3] 845; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 846; SSSE3-NEXT: pmuludq %xmm14, %xmm7 847; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,3,2,3] 848; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] 849; SSSE3-NEXT: pcmpeqd %xmm10, %xmm6 850; SSSE3-NEXT: pxor %xmm11, %xmm6 851; SSSE3-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 852; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] 853; SSSE3-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 854; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 855; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] 856; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 857; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 858; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] 859; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 860; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 861; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] 862; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] 863; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) 864; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) 865; SSSE3-NEXT: movdqa %xmm1, 16(%rdi) 866; SSSE3-NEXT: movdqa %xmm8, (%rdi) 867; SSSE3-NEXT: movdqa %xmm15, %xmm1 868; SSSE3-NEXT: movdqa %xmm5, %xmm2 869; SSSE3-NEXT: movdqa %xmm6, %xmm3 870; SSSE3-NEXT: retq 871; 872; SSE41-LABEL: umulo_v16i32: 873; SSE41: # %bb.0: 874; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] 875; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm0[1,1,3,3] 876; SSE41-NEXT: pmuludq %xmm8, %xmm9 877; SSE41-NEXT: movdqa %xmm0, %xmm8 878; SSE41-NEXT: pmuludq %xmm4, %xmm8 879; SSE41-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,3,3] 880; SSE41-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5],xmm9[6,7] 881; SSE41-NEXT: pxor %xmm12, %xmm12 882; SSE41-NEXT: pcmpeqd %xmm12, %xmm8 883; SSE41-NEXT: pcmpeqd %xmm13, %xmm13 884; SSE41-NEXT: pxor %xmm13, %xmm8 885; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,3,3] 886; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm1[1,1,3,3] 887; SSE41-NEXT: pmuludq %xmm9, %xmm10 888; SSE41-NEXT: movdqa %xmm1, %xmm9 889; SSE41-NEXT: pmuludq %xmm5, %xmm9 890; SSE41-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,3,3] 891; SSE41-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3],xmm9[4,5],xmm10[6,7] 892; SSE41-NEXT: pcmpeqd %xmm12, %xmm9 893; SSE41-NEXT: pxor %xmm13, %xmm9 894; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm6[1,1,3,3] 895; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm2[1,1,3,3] 896; SSE41-NEXT: pmuludq %xmm10, %xmm11 897; SSE41-NEXT: movdqa %xmm2, %xmm10 898; SSE41-NEXT: pmuludq %xmm6, %xmm10 899; SSE41-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] 900; SSE41-NEXT: pblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7] 901; SSE41-NEXT: pcmpeqd %xmm12, %xmm10 902; SSE41-NEXT: pxor %xmm13, %xmm10 903; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[1,1,3,3] 904; SSE41-NEXT: pshufd {{.*#+}} xmm14 = xmm3[1,1,3,3] 905; SSE41-NEXT: pmuludq %xmm11, %xmm14 906; SSE41-NEXT: movdqa %xmm3, %xmm11 907; SSE41-NEXT: pmuludq %xmm7, %xmm11 908; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] 909; SSE41-NEXT: pblendw {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3],xmm11[4,5],xmm14[6,7] 910; SSE41-NEXT: pcmpeqd %xmm12, %xmm11 911; SSE41-NEXT: pxor %xmm13, %xmm11 912; SSE41-NEXT: pmulld %xmm4, %xmm0 913; SSE41-NEXT: pmulld %xmm5, %xmm1 914; SSE41-NEXT: pmulld %xmm6, %xmm2 915; SSE41-NEXT: pmulld %xmm7, %xmm3 916; SSE41-NEXT: movdqa %xmm3, 48(%rdi) 917; SSE41-NEXT: movdqa %xmm2, 32(%rdi) 918; SSE41-NEXT: movdqa %xmm1, 16(%rdi) 919; SSE41-NEXT: movdqa %xmm0, (%rdi) 920; SSE41-NEXT: movdqa %xmm8, %xmm0 921; SSE41-NEXT: movdqa %xmm9, %xmm1 922; SSE41-NEXT: movdqa %xmm10, %xmm2 923; SSE41-NEXT: movdqa %xmm11, %xmm3 924; SSE41-NEXT: retq 925; 926; AVX1-LABEL: umulo_v16i32: 927; AVX1: # %bb.0: 928; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10 929; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,3,3] 930; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 931; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[1,1,3,3] 932; AVX1-NEXT: vpmuludq %xmm6, %xmm7, %xmm6 933; AVX1-NEXT: vpmuludq %xmm10, %xmm12, %xmm7 934; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 935; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7] 936; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 937; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm7 938; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9 939; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm7 940; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,3,3] 941; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] 942; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm4 943; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm6 944; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] 945; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3],xmm6[4,5],xmm4[6,7] 946; AVX1-NEXT: vpcmpeqd %xmm4, %xmm8, %xmm4 947; AVX1-NEXT: vpxor %xmm4, %xmm9, %xmm4 948; AVX1-NEXT: vpackssdw %xmm7, %xmm4, %xmm11 949; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 950; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] 951; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 952; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] 953; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 954; AVX1-NEXT: vpmuludq %xmm6, %xmm4, %xmm7 955; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 956; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 957; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 958; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm13 959; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] 960; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] 961; AVX1-NEXT: vpmuludq %xmm7, %xmm5, %xmm5 962; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm7 963; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] 964; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3],xmm7[4,5],xmm5[6,7] 965; AVX1-NEXT: vpcmpeqd %xmm5, %xmm8, %xmm5 966; AVX1-NEXT: vpxor %xmm5, %xmm9, %xmm5 967; AVX1-NEXT: vpackssdw %xmm13, %xmm5, %xmm5 968; AVX1-NEXT: vpacksswb %xmm11, %xmm5, %xmm5 969; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2 970; AVX1-NEXT: vpmulld %xmm6, %xmm4, %xmm4 971; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3 972; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm6 973; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 974; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] 975; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 976; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 977; AVX1-NEXT: vpacksswb %xmm11, %xmm11, %xmm1 978; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5 979; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 980; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 981; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 982; AVX1-NEXT: vmovdqa %xmm6, 48(%rdi) 983; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi) 984; AVX1-NEXT: vmovdqa %xmm4, 16(%rdi) 985; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 986; AVX1-NEXT: retq 987; 988; AVX2-LABEL: umulo_v16i32: 989; AVX2: # %bb.0: 990; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[1,1,3,3,5,5,7,7] 991; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[1,1,3,3,5,5,7,7] 992; AVX2-NEXT: vpmuludq %ymm4, %ymm5, %ymm4 993; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm5 994; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,3,3,5,5,7,7] 995; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] 996; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 997; AVX2-NEXT: vpcmpeqd %ymm5, %ymm4, %ymm4 998; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 999; AVX2-NEXT: vpxor %ymm6, %ymm4, %ymm4 1000; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7 1001; AVX2-NEXT: vpackssdw %xmm7, %xmm4, %xmm4 1002; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[1,1,3,3,5,5,7,7] 1003; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[1,1,3,3,5,5,7,7] 1004; AVX2-NEXT: vpmuludq %ymm7, %ymm8, %ymm7 1005; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm8 1006; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[1,1,3,3,5,5,7,7] 1007; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] 1008; AVX2-NEXT: vpcmpeqd %ymm5, %ymm7, %ymm5 1009; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5 1010; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 1011; AVX2-NEXT: vpackssdw %xmm6, %xmm5, %xmm5 1012; AVX2-NEXT: vpacksswb %xmm5, %xmm5, %xmm5 1013; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm2 1014; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm3 1015; AVX2-NEXT: vpmovsxbd %xmm5, %ymm0 1016; AVX2-NEXT: vpacksswb %xmm4, %xmm4, %xmm1 1017; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 1018; AVX2-NEXT: vmovdqa %ymm3, 32(%rdi) 1019; AVX2-NEXT: vmovdqa %ymm2, (%rdi) 1020; AVX2-NEXT: retq 1021; 1022; AVX512-LABEL: umulo_v16i32: 1023; AVX512: # %bb.0: 1024; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 1025; AVX512-NEXT: vpshufd {{.*#+}} zmm3 = zmm1[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 1026; AVX512-NEXT: vpshufd {{.*#+}} zmm4 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] 1027; AVX512-NEXT: vpmuludq %zmm3, %zmm4, %zmm3 1028; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [1,17,3,19,5,21,7,23,9,25,11,27,13,29,15,31] 1029; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 1030; AVX512-NEXT: vptestmd %zmm4, %zmm4, %k1 1031; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm1 1032; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1033; AVX512-NEXT: vmovdqa64 %zmm1, (%rdi) 1034; AVX512-NEXT: retq 1035 %t = call {<16 x i32>, <16 x i1>} @llvm.umul.with.overflow.v16i32(<16 x i32> %a0, <16 x i32> %a1) 1036 %val = extractvalue {<16 x i32>, <16 x i1>} %t, 0 1037 %obit = extractvalue {<16 x i32>, <16 x i1>} %t, 1 1038 %res = sext <16 x i1> %obit to <16 x i32> 1039 store <16 x i32> %val, ptr %p2 1040 ret <16 x i32> %res 1041} 1042 1043define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { 1044; SSE2-LABEL: umulo_v16i8: 1045; SSE2: # %bb.0: 1046; SSE2-NEXT: pxor %xmm2, %xmm2 1047; SSE2-NEXT: movdqa %xmm1, %xmm3 1048; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1049; SSE2-NEXT: movdqa %xmm0, %xmm5 1050; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] 1051; SSE2-NEXT: pmullw %xmm3, %xmm5 1052; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1053; SSE2-NEXT: movdqa %xmm5, %xmm3 1054; SSE2-NEXT: pand %xmm4, %xmm3 1055; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1056; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1057; SSE2-NEXT: pmullw %xmm1, %xmm0 1058; SSE2-NEXT: pand %xmm0, %xmm4 1059; SSE2-NEXT: packuswb %xmm3, %xmm4 1060; SSE2-NEXT: psrlw $8, %xmm5 1061; SSE2-NEXT: psrlw $8, %xmm0 1062; SSE2-NEXT: packuswb %xmm5, %xmm0 1063; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 1064; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 1065; SSE2-NEXT: pxor %xmm2, %xmm3 1066; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1067; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1068; SSE2-NEXT: psrad $24, %xmm0 1069; SSE2-NEXT: movdqa %xmm3, %xmm1 1070; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1071; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1072; SSE2-NEXT: pslld $31, %xmm1 1073; SSE2-NEXT: psrad $31, %xmm1 1074; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1075; SSE2-NEXT: movdqa %xmm3, %xmm2 1076; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1077; SSE2-NEXT: pslld $31, %xmm2 1078; SSE2-NEXT: psrad $31, %xmm2 1079; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1080; SSE2-NEXT: pslld $31, %xmm3 1081; SSE2-NEXT: psrad $31, %xmm3 1082; SSE2-NEXT: movdqa %xmm4, (%rdi) 1083; SSE2-NEXT: retq 1084; 1085; SSSE3-LABEL: umulo_v16i8: 1086; SSSE3: # %bb.0: 1087; SSSE3-NEXT: pxor %xmm2, %xmm2 1088; SSSE3-NEXT: movdqa %xmm1, %xmm3 1089; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] 1090; SSSE3-NEXT: movdqa %xmm0, %xmm5 1091; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] 1092; SSSE3-NEXT: pmullw %xmm3, %xmm5 1093; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1094; SSSE3-NEXT: movdqa %xmm5, %xmm3 1095; SSSE3-NEXT: pand %xmm4, %xmm3 1096; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1097; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] 1098; SSSE3-NEXT: pmullw %xmm1, %xmm0 1099; SSSE3-NEXT: pand %xmm0, %xmm4 1100; SSSE3-NEXT: packuswb %xmm3, %xmm4 1101; SSSE3-NEXT: psrlw $8, %xmm5 1102; SSSE3-NEXT: psrlw $8, %xmm0 1103; SSSE3-NEXT: packuswb %xmm5, %xmm0 1104; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 1105; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 1106; SSSE3-NEXT: pxor %xmm2, %xmm3 1107; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] 1108; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] 1109; SSSE3-NEXT: psrad $24, %xmm0 1110; SSSE3-NEXT: movdqa %xmm3, %xmm1 1111; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1112; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1113; SSSE3-NEXT: pslld $31, %xmm1 1114; SSSE3-NEXT: psrad $31, %xmm1 1115; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1116; SSSE3-NEXT: movdqa %xmm3, %xmm2 1117; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1118; SSSE3-NEXT: pslld $31, %xmm2 1119; SSSE3-NEXT: psrad $31, %xmm2 1120; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1121; SSSE3-NEXT: pslld $31, %xmm3 1122; SSSE3-NEXT: psrad $31, %xmm3 1123; SSSE3-NEXT: movdqa %xmm4, (%rdi) 1124; SSSE3-NEXT: retq 1125; 1126; SSE41-LABEL: umulo_v16i8: 1127; SSE41: # %bb.0: 1128; SSE41-NEXT: pxor %xmm2, %xmm2 1129; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1130; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1131; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1132; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1133; SSE41-NEXT: pmullw %xmm1, %xmm0 1134; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1135; SSE41-NEXT: movdqa %xmm0, %xmm1 1136; SSE41-NEXT: pand %xmm4, %xmm1 1137; SSE41-NEXT: pmullw %xmm3, %xmm5 1138; SSE41-NEXT: pand %xmm5, %xmm4 1139; SSE41-NEXT: packuswb %xmm1, %xmm4 1140; SSE41-NEXT: psrlw $8, %xmm0 1141; SSE41-NEXT: psrlw $8, %xmm5 1142; SSE41-NEXT: packuswb %xmm0, %xmm5 1143; SSE41-NEXT: pcmpeqb %xmm2, %xmm5 1144; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 1145; SSE41-NEXT: pxor %xmm5, %xmm3 1146; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 1147; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] 1148; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1149; SSE41-NEXT: pslld $31, %xmm1 1150; SSE41-NEXT: psrad $31, %xmm1 1151; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3] 1152; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 1153; SSE41-NEXT: pslld $31, %xmm2 1154; SSE41-NEXT: psrad $31, %xmm2 1155; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] 1156; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1157; SSE41-NEXT: pslld $31, %xmm3 1158; SSE41-NEXT: psrad $31, %xmm3 1159; SSE41-NEXT: movdqa %xmm4, (%rdi) 1160; SSE41-NEXT: retq 1161; 1162; AVX1-LABEL: umulo_v16i8: 1163; AVX1: # %bb.0: 1164; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1165; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1166; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1167; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 1168; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] 1169; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm5 1170; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1171; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1172; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1173; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 1174; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm4 1175; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1 1176; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1177; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1178; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 1179; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1180; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 1181; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 1182; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 1183; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 1184; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1185; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 1186; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 1187; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 1188; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1189; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 1190; AVX1-NEXT: vmovdqa %xmm4, (%rdi) 1191; AVX1-NEXT: retq 1192; 1193; AVX2-LABEL: umulo_v16i8: 1194; AVX2: # %bb.0: 1195; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1196; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1197; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1198; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 1199; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 1200; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm2 1201; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1202; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 1203; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1204; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 1205; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 1206; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1207; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 1208; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 1209; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1210; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 1211; AVX2-NEXT: vmovdqa %xmm2, (%rdi) 1212; AVX2-NEXT: retq 1213; 1214; AVX512F-LABEL: umulo_v16i8: 1215; AVX512F: # %bb.0: 1216; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1217; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1218; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm1 1219; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm0 1220; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1221; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 1222; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1223; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero 1224; AVX512F-NEXT: vpmovdb %zmm1, (%rdi) 1225; AVX512F-NEXT: retq 1226; 1227; AVX512BW-LABEL: umulo_v16i8: 1228; AVX512BW: # %bb.0: 1229; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1230; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1231; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm1 1232; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm0 1233; AVX512BW-NEXT: vptestmw %ymm0, %ymm0, %k1 1234; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1235; AVX512BW-NEXT: vpmovwb %ymm1, (%rdi) 1236; AVX512BW-NEXT: retq 1237 %t = call {<16 x i8>, <16 x i1>} @llvm.umul.with.overflow.v16i8(<16 x i8> %a0, <16 x i8> %a1) 1238 %val = extractvalue {<16 x i8>, <16 x i1>} %t, 0 1239 %obit = extractvalue {<16 x i8>, <16 x i1>} %t, 1 1240 %res = sext <16 x i1> %obit to <16 x i32> 1241 store <16 x i8> %val, ptr %p2 1242 ret <16 x i32> %res 1243} 1244 1245define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, ptr %p2) nounwind { 1246; SSE2-LABEL: umulo_v32i8: 1247; SSE2: # %bb.0: 1248; SSE2-NEXT: movq %rdi, %rax 1249; SSE2-NEXT: pxor %xmm5, %xmm5 1250; SSE2-NEXT: movdqa %xmm2, %xmm4 1251; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 1252; SSE2-NEXT: movdqa %xmm0, %xmm6 1253; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 1254; SSE2-NEXT: pmullw %xmm4, %xmm6 1255; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] 1256; SSE2-NEXT: movdqa %xmm6, %xmm7 1257; SSE2-NEXT: pand %xmm11, %xmm7 1258; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 1259; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 1260; SSE2-NEXT: pmullw %xmm2, %xmm0 1261; SSE2-NEXT: movdqa %xmm0, %xmm8 1262; SSE2-NEXT: pand %xmm11, %xmm8 1263; SSE2-NEXT: packuswb %xmm7, %xmm8 1264; SSE2-NEXT: movdqa %xmm3, %xmm7 1265; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 1266; SSE2-NEXT: movdqa %xmm1, %xmm2 1267; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 1268; SSE2-NEXT: pmullw %xmm7, %xmm2 1269; SSE2-NEXT: movdqa %xmm2, %xmm7 1270; SSE2-NEXT: pand %xmm11, %xmm7 1271; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 1272; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 1273; SSE2-NEXT: pmullw %xmm3, %xmm1 1274; SSE2-NEXT: pand %xmm1, %xmm11 1275; SSE2-NEXT: packuswb %xmm7, %xmm11 1276; SSE2-NEXT: psrlw $8, %xmm2 1277; SSE2-NEXT: psrlw $8, %xmm1 1278; SSE2-NEXT: packuswb %xmm2, %xmm1 1279; SSE2-NEXT: pcmpeqb %xmm5, %xmm1 1280; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 1281; SSE2-NEXT: pxor %xmm2, %xmm1 1282; SSE2-NEXT: psrlw $8, %xmm6 1283; SSE2-NEXT: psrlw $8, %xmm0 1284; SSE2-NEXT: packuswb %xmm6, %xmm0 1285; SSE2-NEXT: pcmpeqb %xmm5, %xmm0 1286; SSE2-NEXT: pxor %xmm2, %xmm0 1287; SSE2-NEXT: movdqa %xmm0, %xmm3 1288; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] 1289; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1290; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1291; SSE2-NEXT: pslld $31, %xmm0 1292; SSE2-NEXT: psrad $31, %xmm0 1293; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1294; SSE2-NEXT: movdqa %xmm3, %xmm5 1295; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1296; SSE2-NEXT: pslld $31, %xmm5 1297; SSE2-NEXT: psrad $31, %xmm5 1298; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1299; SSE2-NEXT: pslld $31, %xmm3 1300; SSE2-NEXT: psrad $31, %xmm3 1301; SSE2-NEXT: movdqa %xmm1, %xmm6 1302; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1303; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 1304; SSE2-NEXT: pslld $31, %xmm6 1305; SSE2-NEXT: psrad $31, %xmm6 1306; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 1307; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1308; SSE2-NEXT: movdqa %xmm1, %xmm2 1309; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1310; SSE2-NEXT: pslld $31, %xmm2 1311; SSE2-NEXT: psrad $31, %xmm2 1312; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1313; SSE2-NEXT: pslld $31, %xmm1 1314; SSE2-NEXT: psrad $31, %xmm1 1315; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] 1316; SSE2-NEXT: psrad $24, %xmm7 1317; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] 1318; SSE2-NEXT: psrad $24, %xmm4 1319; SSE2-NEXT: movdqa %xmm11, 16(%rsi) 1320; SSE2-NEXT: movdqa %xmm8, (%rsi) 1321; SSE2-NEXT: movdqa %xmm4, 64(%rdi) 1322; SSE2-NEXT: movdqa %xmm7, (%rdi) 1323; SSE2-NEXT: movdqa %xmm1, 112(%rdi) 1324; SSE2-NEXT: movdqa %xmm2, 96(%rdi) 1325; SSE2-NEXT: movdqa %xmm6, 80(%rdi) 1326; SSE2-NEXT: movdqa %xmm3, 48(%rdi) 1327; SSE2-NEXT: movdqa %xmm5, 32(%rdi) 1328; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 1329; SSE2-NEXT: retq 1330; 1331; SSSE3-LABEL: umulo_v32i8: 1332; SSSE3: # %bb.0: 1333; SSSE3-NEXT: movq %rdi, %rax 1334; SSSE3-NEXT: pxor %xmm5, %xmm5 1335; SSSE3-NEXT: movdqa %xmm2, %xmm4 1336; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] 1337; SSSE3-NEXT: movdqa %xmm0, %xmm6 1338; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] 1339; SSSE3-NEXT: pmullw %xmm4, %xmm6 1340; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] 1341; SSSE3-NEXT: movdqa %xmm6, %xmm7 1342; SSSE3-NEXT: pand %xmm11, %xmm7 1343; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] 1344; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] 1345; SSSE3-NEXT: pmullw %xmm2, %xmm0 1346; SSSE3-NEXT: movdqa %xmm0, %xmm8 1347; SSSE3-NEXT: pand %xmm11, %xmm8 1348; SSSE3-NEXT: packuswb %xmm7, %xmm8 1349; SSSE3-NEXT: movdqa %xmm3, %xmm7 1350; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] 1351; SSSE3-NEXT: movdqa %xmm1, %xmm2 1352; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 1353; SSSE3-NEXT: pmullw %xmm7, %xmm2 1354; SSSE3-NEXT: movdqa %xmm2, %xmm7 1355; SSSE3-NEXT: pand %xmm11, %xmm7 1356; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] 1357; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] 1358; SSSE3-NEXT: pmullw %xmm3, %xmm1 1359; SSSE3-NEXT: pand %xmm1, %xmm11 1360; SSSE3-NEXT: packuswb %xmm7, %xmm11 1361; SSSE3-NEXT: psrlw $8, %xmm2 1362; SSSE3-NEXT: psrlw $8, %xmm1 1363; SSSE3-NEXT: packuswb %xmm2, %xmm1 1364; SSSE3-NEXT: pcmpeqb %xmm5, %xmm1 1365; SSSE3-NEXT: pcmpeqd %xmm2, %xmm2 1366; SSSE3-NEXT: pxor %xmm2, %xmm1 1367; SSSE3-NEXT: psrlw $8, %xmm6 1368; SSSE3-NEXT: psrlw $8, %xmm0 1369; SSSE3-NEXT: packuswb %xmm6, %xmm0 1370; SSSE3-NEXT: pcmpeqb %xmm5, %xmm0 1371; SSSE3-NEXT: pxor %xmm2, %xmm0 1372; SSSE3-NEXT: movdqa %xmm0, %xmm3 1373; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] 1374; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1375; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1376; SSSE3-NEXT: pslld $31, %xmm0 1377; SSSE3-NEXT: psrad $31, %xmm0 1378; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1379; SSSE3-NEXT: movdqa %xmm3, %xmm5 1380; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1381; SSSE3-NEXT: pslld $31, %xmm5 1382; SSSE3-NEXT: psrad $31, %xmm5 1383; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1384; SSSE3-NEXT: pslld $31, %xmm3 1385; SSSE3-NEXT: psrad $31, %xmm3 1386; SSSE3-NEXT: movdqa %xmm1, %xmm6 1387; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1388; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] 1389; SSSE3-NEXT: pslld $31, %xmm6 1390; SSSE3-NEXT: psrad $31, %xmm6 1391; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] 1392; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1393; SSSE3-NEXT: movdqa %xmm1, %xmm2 1394; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1395; SSSE3-NEXT: pslld $31, %xmm2 1396; SSSE3-NEXT: psrad $31, %xmm2 1397; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1398; SSSE3-NEXT: pslld $31, %xmm1 1399; SSSE3-NEXT: psrad $31, %xmm1 1400; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3] 1401; SSSE3-NEXT: psrad $24, %xmm7 1402; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3] 1403; SSSE3-NEXT: psrad $24, %xmm4 1404; SSSE3-NEXT: movdqa %xmm11, 16(%rsi) 1405; SSSE3-NEXT: movdqa %xmm8, (%rsi) 1406; SSSE3-NEXT: movdqa %xmm4, 64(%rdi) 1407; SSSE3-NEXT: movdqa %xmm7, (%rdi) 1408; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) 1409; SSSE3-NEXT: movdqa %xmm2, 96(%rdi) 1410; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) 1411; SSSE3-NEXT: movdqa %xmm3, 48(%rdi) 1412; SSSE3-NEXT: movdqa %xmm5, 32(%rdi) 1413; SSSE3-NEXT: movdqa %xmm0, 16(%rdi) 1414; SSSE3-NEXT: retq 1415; 1416; SSE41-LABEL: umulo_v32i8: 1417; SSE41: # %bb.0: 1418; SSE41-NEXT: movq %rdi, %rax 1419; SSE41-NEXT: pxor %xmm8, %xmm8 1420; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1421; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm8[8],xmm2[9],xmm8[9],xmm2[10],xmm8[10],xmm2[11],xmm8[11],xmm2[12],xmm8[12],xmm2[13],xmm8[13],xmm2[14],xmm8[14],xmm2[15],xmm8[15] 1422; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1423; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] 1424; SSE41-NEXT: pmullw %xmm2, %xmm0 1425; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] 1426; SSE41-NEXT: movdqa %xmm0, %xmm6 1427; SSE41-NEXT: pand %xmm10, %xmm6 1428; SSE41-NEXT: pmullw %xmm5, %xmm4 1429; SSE41-NEXT: movdqa %xmm4, %xmm9 1430; SSE41-NEXT: pand %xmm10, %xmm9 1431; SSE41-NEXT: packuswb %xmm6, %xmm9 1432; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1433; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm8[8],xmm3[9],xmm8[9],xmm3[10],xmm8[10],xmm3[11],xmm8[11],xmm3[12],xmm8[12],xmm3[13],xmm8[13],xmm3[14],xmm8[14],xmm3[15],xmm8[15] 1434; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1435; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm8[8],xmm1[9],xmm8[9],xmm1[10],xmm8[10],xmm1[11],xmm8[11],xmm1[12],xmm8[12],xmm1[13],xmm8[13],xmm1[14],xmm8[14],xmm1[15],xmm8[15] 1436; SSE41-NEXT: pmullw %xmm3, %xmm1 1437; SSE41-NEXT: movdqa %xmm1, %xmm3 1438; SSE41-NEXT: pand %xmm10, %xmm3 1439; SSE41-NEXT: pmullw %xmm7, %xmm6 1440; SSE41-NEXT: pand %xmm6, %xmm10 1441; SSE41-NEXT: packuswb %xmm3, %xmm10 1442; SSE41-NEXT: psrlw $8, %xmm1 1443; SSE41-NEXT: psrlw $8, %xmm6 1444; SSE41-NEXT: packuswb %xmm1, %xmm6 1445; SSE41-NEXT: pcmpeqb %xmm8, %xmm6 1446; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 1447; SSE41-NEXT: pxor %xmm1, %xmm6 1448; SSE41-NEXT: psrlw $8, %xmm0 1449; SSE41-NEXT: psrlw $8, %xmm4 1450; SSE41-NEXT: packuswb %xmm0, %xmm4 1451; SSE41-NEXT: pcmpeqb %xmm8, %xmm4 1452; SSE41-NEXT: pxor %xmm1, %xmm4 1453; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 1454; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 1455; SSE41-NEXT: pslld $31, %xmm0 1456; SSE41-NEXT: psrad $31, %xmm0 1457; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] 1458; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero 1459; SSE41-NEXT: pslld $31, %xmm1 1460; SSE41-NEXT: psrad $31, %xmm1 1461; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,3,3,3] 1462; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero 1463; SSE41-NEXT: pslld $31, %xmm3 1464; SSE41-NEXT: psrad $31, %xmm3 1465; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1] 1466; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero 1467; SSE41-NEXT: pslld $31, %xmm7 1468; SSE41-NEXT: psrad $31, %xmm7 1469; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,2,3] 1470; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero 1471; SSE41-NEXT: pslld $31, %xmm5 1472; SSE41-NEXT: psrad $31, %xmm5 1473; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[3,3,3,3] 1474; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero 1475; SSE41-NEXT: pslld $31, %xmm2 1476; SSE41-NEXT: psrad $31, %xmm2 1477; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 1478; SSE41-NEXT: pmovsxbd %xmm6, %xmm6 1479; SSE41-NEXT: movdqa %xmm10, 16(%rsi) 1480; SSE41-NEXT: movdqa %xmm9, (%rsi) 1481; SSE41-NEXT: movdqa %xmm6, 64(%rdi) 1482; SSE41-NEXT: movdqa %xmm4, (%rdi) 1483; SSE41-NEXT: movdqa %xmm2, 112(%rdi) 1484; SSE41-NEXT: movdqa %xmm5, 96(%rdi) 1485; SSE41-NEXT: movdqa %xmm7, 80(%rdi) 1486; SSE41-NEXT: movdqa %xmm3, 48(%rdi) 1487; SSE41-NEXT: movdqa %xmm1, 32(%rdi) 1488; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 1489; SSE41-NEXT: retq 1490; 1491; AVX1-LABEL: umulo_v32i8: 1492; AVX1: # %bb.0: 1493; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 1494; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1495; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1496; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 1497; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] 1498; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm4 1499; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1500; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1501; AVX1-NEXT: vpmullw %xmm6, %xmm7, %xmm6 1502; AVX1-NEXT: vpand %xmm5, %xmm6, %xmm7 1503; AVX1-NEXT: vpackuswb %xmm4, %xmm7, %xmm8 1504; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 1505; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] 1506; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 1507; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] 1508; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 1509; AVX1-NEXT: vpand %xmm5, %xmm4, %xmm7 1510; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1511; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1512; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 1513; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm1 1514; AVX1-NEXT: vpackuswb %xmm7, %xmm1, %xmm5 1515; AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 1516; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 1517; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 1518; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 1519; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 1520; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm4 1521; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm0 1522; AVX1-NEXT: vpsrlw $8, %xmm6, %xmm3 1523; AVX1-NEXT: vpackuswb %xmm0, %xmm3, %xmm0 1524; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 1525; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 1526; AVX1-NEXT: vpmovsxbd %xmm1, %xmm0 1527; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] 1528; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 1529; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 1530; AVX1-NEXT: vpmovsxbd %xmm4, %xmm2 1531; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] 1532; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1533; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 1534; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] 1535; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1536; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] 1537; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 1538; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 1539; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] 1540; AVX1-NEXT: vpmovsxbd %xmm3, %xmm3 1541; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,3,3,3] 1542; AVX1-NEXT: vpmovsxbd %xmm4, %xmm4 1543; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 1544; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi) 1545; AVX1-NEXT: vmovdqa %xmm8, (%rdi) 1546; AVX1-NEXT: retq 1547; 1548; AVX2-LABEL: umulo_v32i8: 1549; AVX2: # %bb.0: 1550; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 1551; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] 1552; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] 1553; AVX2-NEXT: vpmullw %ymm3, %ymm4, %ymm3 1554; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 1555; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm5 1556; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] 1557; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] 1558; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 1559; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm1 1560; AVX2-NEXT: vpackuswb %ymm5, %ymm1, %ymm4 1561; AVX2-NEXT: vpsrlw $8, %ymm3, %ymm1 1562; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 1563; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 1564; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 1565; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 1566; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1 1567; AVX2-NEXT: vpmovsxbd %xmm1, %ymm0 1568; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 1569; AVX2-NEXT: vpmovsxbd %xmm3, %ymm2 1570; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] 1571; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 1572; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] 1573; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 1574; AVX2-NEXT: vmovdqa %ymm4, (%rdi) 1575; AVX2-NEXT: retq 1576; 1577; AVX512F-LABEL: umulo_v32i8: 1578; AVX512F: # %bb.0: 1579; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 1580; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 1581; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 1582; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 1583; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 1584; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm3 1585; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero 1586; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 1587; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 1588; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 1589; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm3 1590; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm0 1591; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 1592; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 1593; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} 1594; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} 1595; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 1596; AVX512F-NEXT: vpmovdb %zmm2, 16(%rdi) 1597; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero 1598; AVX512F-NEXT: vpmovdb %zmm2, (%rdi) 1599; AVX512F-NEXT: retq 1600; 1601; AVX512BW-LABEL: umulo_v32i8: 1602; AVX512BW: # %bb.0: 1603; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero 1604; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero 1605; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm2 1606; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm0 1607; AVX512BW-NEXT: vptestmw %zmm0, %zmm0, %k1 1608; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 1609; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 1610; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} 1611; AVX512BW-NEXT: vpmovwb %zmm2, (%rdi) 1612; AVX512BW-NEXT: retq 1613 %t = call {<32 x i8>, <32 x i1>} @llvm.umul.with.overflow.v32i8(<32 x i8> %a0, <32 x i8> %a1) 1614 %val = extractvalue {<32 x i8>, <32 x i1>} %t, 0 1615 %obit = extractvalue {<32 x i8>, <32 x i1>} %t, 1 1616 %res = sext <32 x i1> %obit to <32 x i32> 1617 store <32 x i8> %val, ptr %p2 1618 ret <32 x i32> %res 1619} 1620 1621define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, ptr %p2) nounwind { 1622; SSE2-LABEL: umulo_v64i8: 1623; SSE2: # %bb.0: 1624; SSE2-NEXT: movq %rdi, %rax 1625; SSE2-NEXT: pxor %xmm9, %xmm9 1626; SSE2-NEXT: movdqa %xmm4, %xmm8 1627; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] 1628; SSE2-NEXT: movdqa %xmm0, %xmm10 1629; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 1630; SSE2-NEXT: pmullw %xmm8, %xmm10 1631; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 1632; SSE2-NEXT: movdqa %xmm10, %xmm12 1633; SSE2-NEXT: pand %xmm8, %xmm12 1634; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 1635; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1636; SSE2-NEXT: pmullw %xmm4, %xmm0 1637; SSE2-NEXT: movdqa %xmm0, %xmm11 1638; SSE2-NEXT: pand %xmm8, %xmm11 1639; SSE2-NEXT: packuswb %xmm12, %xmm11 1640; SSE2-NEXT: movdqa %xmm5, %xmm4 1641; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1642; SSE2-NEXT: movdqa %xmm1, %xmm13 1643; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] 1644; SSE2-NEXT: pmullw %xmm4, %xmm13 1645; SSE2-NEXT: movdqa %xmm13, %xmm4 1646; SSE2-NEXT: pand %xmm8, %xmm4 1647; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 1648; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 1649; SSE2-NEXT: pmullw %xmm5, %xmm1 1650; SSE2-NEXT: movdqa %xmm1, %xmm12 1651; SSE2-NEXT: pand %xmm8, %xmm12 1652; SSE2-NEXT: packuswb %xmm4, %xmm12 1653; SSE2-NEXT: movdqa %xmm6, %xmm4 1654; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1655; SSE2-NEXT: movdqa %xmm2, %xmm5 1656; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] 1657; SSE2-NEXT: pmullw %xmm4, %xmm5 1658; SSE2-NEXT: movdqa %xmm5, %xmm4 1659; SSE2-NEXT: pand %xmm8, %xmm4 1660; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] 1661; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 1662; SSE2-NEXT: pmullw %xmm6, %xmm2 1663; SSE2-NEXT: movdqa %xmm2, %xmm14 1664; SSE2-NEXT: pand %xmm8, %xmm14 1665; SSE2-NEXT: packuswb %xmm4, %xmm14 1666; SSE2-NEXT: movdqa %xmm7, %xmm4 1667; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1668; SSE2-NEXT: movdqa %xmm3, %xmm6 1669; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] 1670; SSE2-NEXT: pmullw %xmm4, %xmm6 1671; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 1672; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 1673; SSE2-NEXT: pmullw %xmm7, %xmm3 1674; SSE2-NEXT: movdqa %xmm6, %xmm4 1675; SSE2-NEXT: pand %xmm8, %xmm4 1676; SSE2-NEXT: pand %xmm3, %xmm8 1677; SSE2-NEXT: packuswb %xmm4, %xmm8 1678; SSE2-NEXT: psrlw $8, %xmm6 1679; SSE2-NEXT: psrlw $8, %xmm3 1680; SSE2-NEXT: packuswb %xmm6, %xmm3 1681; SSE2-NEXT: psrlw $8, %xmm5 1682; SSE2-NEXT: psrlw $8, %xmm2 1683; SSE2-NEXT: packuswb %xmm5, %xmm2 1684; SSE2-NEXT: psrlw $8, %xmm13 1685; SSE2-NEXT: psrlw $8, %xmm1 1686; SSE2-NEXT: packuswb %xmm13, %xmm1 1687; SSE2-NEXT: psrlw $8, %xmm10 1688; SSE2-NEXT: psrlw $8, %xmm0 1689; SSE2-NEXT: packuswb %xmm10, %xmm0 1690; SSE2-NEXT: pcmpeqb %xmm9, %xmm3 1691; SSE2-NEXT: pcmpeqb %xmm9, %xmm2 1692; SSE2-NEXT: pcmpeqb %xmm9, %xmm1 1693; SSE2-NEXT: pcmpeqb %xmm9, %xmm0 1694; SSE2-NEXT: pcmpeqd %xmm4, %xmm4 1695; SSE2-NEXT: pxor %xmm4, %xmm3 1696; SSE2-NEXT: pxor %xmm4, %xmm2 1697; SSE2-NEXT: pxor %xmm4, %xmm1 1698; SSE2-NEXT: pxor %xmm4, %xmm0 1699; SSE2-NEXT: movdqa %xmm8, 48(%rsi) 1700; SSE2-NEXT: movdqa %xmm14, 32(%rsi) 1701; SSE2-NEXT: movdqa %xmm12, 16(%rsi) 1702; SSE2-NEXT: movdqa %xmm3, %xmm4 1703; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1704; SSE2-NEXT: movdqa %xmm11, (%rsi) 1705; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 1706; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1707; SSE2-NEXT: psrad $24, %xmm5 1708; SSE2-NEXT: movdqa %xmm5, 192(%rdi) 1709; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] 1710; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1711; SSE2-NEXT: psrad $24, %xmm5 1712; SSE2-NEXT: movdqa %xmm5, 128(%rdi) 1713; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 1714; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1715; SSE2-NEXT: psrad $24, %xmm5 1716; SSE2-NEXT: movdqa %xmm5, 64(%rdi) 1717; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 1718; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1719; SSE2-NEXT: psrad $24, %xmm5 1720; SSE2-NEXT: movdqa %xmm5, (%rdi) 1721; SSE2-NEXT: movdqa %xmm4, %xmm5 1722; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1723; SSE2-NEXT: pslld $31, %xmm4 1724; SSE2-NEXT: psrad $31, %xmm4 1725; SSE2-NEXT: movdqa %xmm4, 224(%rdi) 1726; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1727; SSE2-NEXT: pslld $31, %xmm5 1728; SSE2-NEXT: psrad $31, %xmm5 1729; SSE2-NEXT: movdqa %xmm5, 240(%rdi) 1730; SSE2-NEXT: movdqa %xmm2, %xmm4 1731; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1732; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1733; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1734; SSE2-NEXT: pslld $31, %xmm3 1735; SSE2-NEXT: psrad $31, %xmm3 1736; SSE2-NEXT: movdqa %xmm3, 208(%rdi) 1737; SSE2-NEXT: movdqa %xmm4, %xmm3 1738; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1739; SSE2-NEXT: pslld $31, %xmm4 1740; SSE2-NEXT: psrad $31, %xmm4 1741; SSE2-NEXT: movdqa %xmm4, 160(%rdi) 1742; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1743; SSE2-NEXT: pslld $31, %xmm3 1744; SSE2-NEXT: psrad $31, %xmm3 1745; SSE2-NEXT: movdqa %xmm3, 176(%rdi) 1746; SSE2-NEXT: movdqa %xmm1, %xmm3 1747; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1748; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1749; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1750; SSE2-NEXT: pslld $31, %xmm2 1751; SSE2-NEXT: psrad $31, %xmm2 1752; SSE2-NEXT: movdqa %xmm2, 144(%rdi) 1753; SSE2-NEXT: movdqa %xmm3, %xmm2 1754; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1755; SSE2-NEXT: pslld $31, %xmm3 1756; SSE2-NEXT: psrad $31, %xmm3 1757; SSE2-NEXT: movdqa %xmm3, 96(%rdi) 1758; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1759; SSE2-NEXT: pslld $31, %xmm2 1760; SSE2-NEXT: psrad $31, %xmm2 1761; SSE2-NEXT: movdqa %xmm2, 112(%rdi) 1762; SSE2-NEXT: movdqa %xmm0, %xmm2 1763; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1764; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1765; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1766; SSE2-NEXT: pslld $31, %xmm1 1767; SSE2-NEXT: psrad $31, %xmm1 1768; SSE2-NEXT: movdqa %xmm1, 80(%rdi) 1769; SSE2-NEXT: movdqa %xmm2, %xmm1 1770; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1771; SSE2-NEXT: pslld $31, %xmm2 1772; SSE2-NEXT: psrad $31, %xmm2 1773; SSE2-NEXT: movdqa %xmm2, 32(%rdi) 1774; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1775; SSE2-NEXT: pslld $31, %xmm1 1776; SSE2-NEXT: psrad $31, %xmm1 1777; SSE2-NEXT: movdqa %xmm1, 48(%rdi) 1778; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1779; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1780; SSE2-NEXT: pslld $31, %xmm0 1781; SSE2-NEXT: psrad $31, %xmm0 1782; SSE2-NEXT: movdqa %xmm0, 16(%rdi) 1783; SSE2-NEXT: retq 1784; 1785; SSSE3-LABEL: umulo_v64i8: 1786; SSSE3: # %bb.0: 1787; SSSE3-NEXT: movq %rdi, %rax 1788; SSSE3-NEXT: pxor %xmm9, %xmm9 1789; SSSE3-NEXT: movdqa %xmm4, %xmm8 1790; SSSE3-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] 1791; SSSE3-NEXT: movdqa %xmm0, %xmm10 1792; SSSE3-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] 1793; SSSE3-NEXT: pmullw %xmm8, %xmm10 1794; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] 1795; SSSE3-NEXT: movdqa %xmm10, %xmm12 1796; SSSE3-NEXT: pand %xmm8, %xmm12 1797; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7] 1798; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] 1799; SSSE3-NEXT: pmullw %xmm4, %xmm0 1800; SSSE3-NEXT: movdqa %xmm0, %xmm11 1801; SSSE3-NEXT: pand %xmm8, %xmm11 1802; SSSE3-NEXT: packuswb %xmm12, %xmm11 1803; SSSE3-NEXT: movdqa %xmm5, %xmm4 1804; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1805; SSSE3-NEXT: movdqa %xmm1, %xmm13 1806; SSSE3-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] 1807; SSSE3-NEXT: pmullw %xmm4, %xmm13 1808; SSSE3-NEXT: movdqa %xmm13, %xmm4 1809; SSSE3-NEXT: pand %xmm8, %xmm4 1810; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] 1811; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] 1812; SSSE3-NEXT: pmullw %xmm5, %xmm1 1813; SSSE3-NEXT: movdqa %xmm1, %xmm12 1814; SSSE3-NEXT: pand %xmm8, %xmm12 1815; SSSE3-NEXT: packuswb %xmm4, %xmm12 1816; SSSE3-NEXT: movdqa %xmm6, %xmm4 1817; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1818; SSSE3-NEXT: movdqa %xmm2, %xmm5 1819; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] 1820; SSSE3-NEXT: pmullw %xmm4, %xmm5 1821; SSSE3-NEXT: movdqa %xmm5, %xmm4 1822; SSSE3-NEXT: pand %xmm8, %xmm4 1823; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] 1824; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] 1825; SSSE3-NEXT: pmullw %xmm6, %xmm2 1826; SSSE3-NEXT: movdqa %xmm2, %xmm14 1827; SSSE3-NEXT: pand %xmm8, %xmm14 1828; SSSE3-NEXT: packuswb %xmm4, %xmm14 1829; SSSE3-NEXT: movdqa %xmm7, %xmm4 1830; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15] 1831; SSSE3-NEXT: movdqa %xmm3, %xmm6 1832; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15] 1833; SSSE3-NEXT: pmullw %xmm4, %xmm6 1834; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 1835; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] 1836; SSSE3-NEXT: pmullw %xmm7, %xmm3 1837; SSSE3-NEXT: movdqa %xmm6, %xmm4 1838; SSSE3-NEXT: pand %xmm8, %xmm4 1839; SSSE3-NEXT: pand %xmm3, %xmm8 1840; SSSE3-NEXT: packuswb %xmm4, %xmm8 1841; SSSE3-NEXT: psrlw $8, %xmm6 1842; SSSE3-NEXT: psrlw $8, %xmm3 1843; SSSE3-NEXT: packuswb %xmm6, %xmm3 1844; SSSE3-NEXT: psrlw $8, %xmm5 1845; SSSE3-NEXT: psrlw $8, %xmm2 1846; SSSE3-NEXT: packuswb %xmm5, %xmm2 1847; SSSE3-NEXT: psrlw $8, %xmm13 1848; SSSE3-NEXT: psrlw $8, %xmm1 1849; SSSE3-NEXT: packuswb %xmm13, %xmm1 1850; SSSE3-NEXT: psrlw $8, %xmm10 1851; SSSE3-NEXT: psrlw $8, %xmm0 1852; SSSE3-NEXT: packuswb %xmm10, %xmm0 1853; SSSE3-NEXT: pcmpeqb %xmm9, %xmm3 1854; SSSE3-NEXT: pcmpeqb %xmm9, %xmm2 1855; SSSE3-NEXT: pcmpeqb %xmm9, %xmm1 1856; SSSE3-NEXT: pcmpeqb %xmm9, %xmm0 1857; SSSE3-NEXT: pcmpeqd %xmm4, %xmm4 1858; SSSE3-NEXT: pxor %xmm4, %xmm3 1859; SSSE3-NEXT: pxor %xmm4, %xmm2 1860; SSSE3-NEXT: pxor %xmm4, %xmm1 1861; SSSE3-NEXT: pxor %xmm4, %xmm0 1862; SSSE3-NEXT: movdqa %xmm8, 48(%rsi) 1863; SSSE3-NEXT: movdqa %xmm14, 32(%rsi) 1864; SSSE3-NEXT: movdqa %xmm12, 16(%rsi) 1865; SSSE3-NEXT: movdqa %xmm3, %xmm4 1866; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1867; SSSE3-NEXT: movdqa %xmm11, (%rsi) 1868; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] 1869; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1870; SSSE3-NEXT: psrad $24, %xmm5 1871; SSSE3-NEXT: movdqa %xmm5, 192(%rdi) 1872; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] 1873; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1874; SSSE3-NEXT: psrad $24, %xmm5 1875; SSSE3-NEXT: movdqa %xmm5, 128(%rdi) 1876; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 1877; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1878; SSSE3-NEXT: psrad $24, %xmm5 1879; SSSE3-NEXT: movdqa %xmm5, 64(%rdi) 1880; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] 1881; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] 1882; SSSE3-NEXT: psrad $24, %xmm5 1883; SSSE3-NEXT: movdqa %xmm5, (%rdi) 1884; SSSE3-NEXT: movdqa %xmm4, %xmm5 1885; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1886; SSSE3-NEXT: pslld $31, %xmm4 1887; SSSE3-NEXT: psrad $31, %xmm4 1888; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) 1889; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] 1890; SSSE3-NEXT: pslld $31, %xmm5 1891; SSSE3-NEXT: psrad $31, %xmm5 1892; SSSE3-NEXT: movdqa %xmm5, 240(%rdi) 1893; SSSE3-NEXT: movdqa %xmm2, %xmm4 1894; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1895; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1896; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1897; SSSE3-NEXT: pslld $31, %xmm3 1898; SSSE3-NEXT: psrad $31, %xmm3 1899; SSSE3-NEXT: movdqa %xmm3, 208(%rdi) 1900; SSSE3-NEXT: movdqa %xmm4, %xmm3 1901; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] 1902; SSSE3-NEXT: pslld $31, %xmm4 1903; SSSE3-NEXT: psrad $31, %xmm4 1904; SSSE3-NEXT: movdqa %xmm4, 160(%rdi) 1905; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] 1906; SSSE3-NEXT: pslld $31, %xmm3 1907; SSSE3-NEXT: psrad $31, %xmm3 1908; SSSE3-NEXT: movdqa %xmm3, 176(%rdi) 1909; SSSE3-NEXT: movdqa %xmm1, %xmm3 1910; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1911; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1912; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1913; SSSE3-NEXT: pslld $31, %xmm2 1914; SSSE3-NEXT: psrad $31, %xmm2 1915; SSSE3-NEXT: movdqa %xmm2, 144(%rdi) 1916; SSSE3-NEXT: movdqa %xmm3, %xmm2 1917; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] 1918; SSSE3-NEXT: pslld $31, %xmm3 1919; SSSE3-NEXT: psrad $31, %xmm3 1920; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) 1921; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] 1922; SSSE3-NEXT: pslld $31, %xmm2 1923; SSSE3-NEXT: psrad $31, %xmm2 1924; SSSE3-NEXT: movdqa %xmm2, 112(%rdi) 1925; SSSE3-NEXT: movdqa %xmm0, %xmm2 1926; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] 1927; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1928; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1929; SSSE3-NEXT: pslld $31, %xmm1 1930; SSSE3-NEXT: psrad $31, %xmm1 1931; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) 1932; SSSE3-NEXT: movdqa %xmm2, %xmm1 1933; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] 1934; SSSE3-NEXT: pslld $31, %xmm2 1935; SSSE3-NEXT: psrad $31, %xmm2 1936; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) 1937; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 1938; SSSE3-NEXT: pslld $31, %xmm1 1939; SSSE3-NEXT: psrad $31, %xmm1 1940; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) 1941; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1942; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] 1943; SSSE3-NEXT: pslld $31, %xmm0 1944; SSSE3-NEXT: psrad $31, %xmm0 1945; SSSE3-NEXT: movdqa %xmm0, 16(%rdi) 1946; SSSE3-NEXT: retq 1947; 1948; SSE41-LABEL: umulo_v64i8: 1949; SSE41: # %bb.0: 1950; SSE41-NEXT: movq %rdi, %rax 1951; SSE41-NEXT: pxor %xmm13, %xmm13 1952; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero 1953; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm13[8],xmm4[9],xmm13[9],xmm4[10],xmm13[10],xmm4[11],xmm13[11],xmm4[12],xmm13[12],xmm4[13],xmm13[13],xmm4[14],xmm13[14],xmm4[15],xmm13[15] 1954; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 1955; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm13[8],xmm0[9],xmm13[9],xmm0[10],xmm13[10],xmm0[11],xmm13[11],xmm0[12],xmm13[12],xmm0[13],xmm13[13],xmm0[14],xmm13[14],xmm0[15],xmm13[15] 1956; SSE41-NEXT: pmullw %xmm4, %xmm0 1957; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] 1958; SSE41-NEXT: movdqa %xmm0, %xmm4 1959; SSE41-NEXT: pand %xmm9, %xmm4 1960; SSE41-NEXT: pmullw %xmm10, %xmm8 1961; SSE41-NEXT: movdqa %xmm8, %xmm10 1962; SSE41-NEXT: pand %xmm9, %xmm10 1963; SSE41-NEXT: packuswb %xmm4, %xmm10 1964; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero 1965; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm13[8],xmm5[9],xmm13[9],xmm5[10],xmm13[10],xmm5[11],xmm13[11],xmm5[12],xmm13[12],xmm5[13],xmm13[13],xmm5[14],xmm13[14],xmm5[15],xmm13[15] 1966; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 1967; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm13[8],xmm1[9],xmm13[9],xmm1[10],xmm13[10],xmm1[11],xmm13[11],xmm1[12],xmm13[12],xmm1[13],xmm13[13],xmm1[14],xmm13[14],xmm1[15],xmm13[15] 1968; SSE41-NEXT: pmullw %xmm5, %xmm1 1969; SSE41-NEXT: movdqa %xmm1, %xmm5 1970; SSE41-NEXT: pand %xmm9, %xmm5 1971; SSE41-NEXT: pmullw %xmm11, %xmm4 1972; SSE41-NEXT: movdqa %xmm4, %xmm11 1973; SSE41-NEXT: pand %xmm9, %xmm11 1974; SSE41-NEXT: packuswb %xmm5, %xmm11 1975; SSE41-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero 1976; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm13[8],xmm6[9],xmm13[9],xmm6[10],xmm13[10],xmm6[11],xmm13[11],xmm6[12],xmm13[12],xmm6[13],xmm13[13],xmm6[14],xmm13[14],xmm6[15],xmm13[15] 1977; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 1978; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15] 1979; SSE41-NEXT: pmullw %xmm6, %xmm2 1980; SSE41-NEXT: movdqa %xmm2, %xmm6 1981; SSE41-NEXT: pand %xmm9, %xmm6 1982; SSE41-NEXT: pmullw %xmm12, %xmm5 1983; SSE41-NEXT: movdqa %xmm5, %xmm12 1984; SSE41-NEXT: pand %xmm9, %xmm12 1985; SSE41-NEXT: packuswb %xmm6, %xmm12 1986; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero 1987; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm13[8],xmm7[9],xmm13[9],xmm7[10],xmm13[10],xmm7[11],xmm13[11],xmm7[12],xmm13[12],xmm7[13],xmm13[13],xmm7[14],xmm13[14],xmm7[15],xmm13[15] 1988; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 1989; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15] 1990; SSE41-NEXT: pmullw %xmm7, %xmm3 1991; SSE41-NEXT: pmullw %xmm14, %xmm6 1992; SSE41-NEXT: movdqa %xmm3, %xmm7 1993; SSE41-NEXT: pand %xmm9, %xmm7 1994; SSE41-NEXT: pand %xmm6, %xmm9 1995; SSE41-NEXT: packuswb %xmm7, %xmm9 1996; SSE41-NEXT: psrlw $8, %xmm3 1997; SSE41-NEXT: psrlw $8, %xmm6 1998; SSE41-NEXT: packuswb %xmm3, %xmm6 1999; SSE41-NEXT: psrlw $8, %xmm2 2000; SSE41-NEXT: psrlw $8, %xmm5 2001; SSE41-NEXT: packuswb %xmm2, %xmm5 2002; SSE41-NEXT: psrlw $8, %xmm1 2003; SSE41-NEXT: psrlw $8, %xmm4 2004; SSE41-NEXT: packuswb %xmm1, %xmm4 2005; SSE41-NEXT: psrlw $8, %xmm0 2006; SSE41-NEXT: psrlw $8, %xmm8 2007; SSE41-NEXT: packuswb %xmm0, %xmm8 2008; SSE41-NEXT: pcmpeqb %xmm13, %xmm6 2009; SSE41-NEXT: pcmpeqb %xmm13, %xmm5 2010; SSE41-NEXT: pcmpeqb %xmm13, %xmm4 2011; SSE41-NEXT: pcmpeqb %xmm13, %xmm8 2012; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 2013; SSE41-NEXT: pxor %xmm0, %xmm6 2014; SSE41-NEXT: pxor %xmm0, %xmm5 2015; SSE41-NEXT: pxor %xmm0, %xmm4 2016; SSE41-NEXT: pxor %xmm0, %xmm8 2017; SSE41-NEXT: movdqa %xmm9, 48(%rsi) 2018; SSE41-NEXT: movdqa %xmm12, 32(%rsi) 2019; SSE41-NEXT: movdqa %xmm11, 16(%rsi) 2020; SSE41-NEXT: movdqa %xmm10, (%rsi) 2021; SSE41-NEXT: pmovsxbd %xmm6, %xmm0 2022; SSE41-NEXT: movdqa %xmm0, 192(%rdi) 2023; SSE41-NEXT: pmovsxbd %xmm5, %xmm0 2024; SSE41-NEXT: movdqa %xmm0, 128(%rdi) 2025; SSE41-NEXT: pmovsxbd %xmm4, %xmm0 2026; SSE41-NEXT: movdqa %xmm0, 64(%rdi) 2027; SSE41-NEXT: pmovsxbd %xmm8, %xmm0 2028; SSE41-NEXT: movdqa %xmm0, (%rdi) 2029; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] 2030; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2031; SSE41-NEXT: pslld $31, %xmm0 2032; SSE41-NEXT: psrad $31, %xmm0 2033; SSE41-NEXT: movdqa %xmm0, 224(%rdi) 2034; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] 2035; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2036; SSE41-NEXT: pslld $31, %xmm0 2037; SSE41-NEXT: psrad $31, %xmm0 2038; SSE41-NEXT: movdqa %xmm0, 240(%rdi) 2039; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 2040; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2041; SSE41-NEXT: pslld $31, %xmm0 2042; SSE41-NEXT: psrad $31, %xmm0 2043; SSE41-NEXT: movdqa %xmm0, 208(%rdi) 2044; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] 2045; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2046; SSE41-NEXT: pslld $31, %xmm0 2047; SSE41-NEXT: psrad $31, %xmm0 2048; SSE41-NEXT: movdqa %xmm0, 160(%rdi) 2049; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] 2050; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2051; SSE41-NEXT: pslld $31, %xmm0 2052; SSE41-NEXT: psrad $31, %xmm0 2053; SSE41-NEXT: movdqa %xmm0, 176(%rdi) 2054; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 2055; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2056; SSE41-NEXT: pslld $31, %xmm0 2057; SSE41-NEXT: psrad $31, %xmm0 2058; SSE41-NEXT: movdqa %xmm0, 144(%rdi) 2059; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 2060; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2061; SSE41-NEXT: pslld $31, %xmm0 2062; SSE41-NEXT: psrad $31, %xmm0 2063; SSE41-NEXT: movdqa %xmm0, 96(%rdi) 2064; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] 2065; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2066; SSE41-NEXT: pslld $31, %xmm0 2067; SSE41-NEXT: psrad $31, %xmm0 2068; SSE41-NEXT: movdqa %xmm0, 112(%rdi) 2069; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 2070; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2071; SSE41-NEXT: pslld $31, %xmm0 2072; SSE41-NEXT: psrad $31, %xmm0 2073; SSE41-NEXT: movdqa %xmm0, 80(%rdi) 2074; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] 2075; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2076; SSE41-NEXT: pslld $31, %xmm0 2077; SSE41-NEXT: psrad $31, %xmm0 2078; SSE41-NEXT: movdqa %xmm0, 32(%rdi) 2079; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] 2080; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2081; SSE41-NEXT: pslld $31, %xmm0 2082; SSE41-NEXT: psrad $31, %xmm0 2083; SSE41-NEXT: movdqa %xmm0, 48(%rdi) 2084; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 2085; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero 2086; SSE41-NEXT: pslld $31, %xmm0 2087; SSE41-NEXT: psrad $31, %xmm0 2088; SSE41-NEXT: movdqa %xmm0, 16(%rdi) 2089; SSE41-NEXT: retq 2090; 2091; AVX1-LABEL: umulo_v64i8: 2092; AVX1: # %bb.0: 2093; AVX1-NEXT: movq %rdi, %rax 2094; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 2095; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 2096; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 2097; AVX1-NEXT: vpmullw %xmm4, %xmm6, %xmm9 2098; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] 2099; AVX1-NEXT: vpand %xmm6, %xmm9, %xmm8 2100; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2101; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2102; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm11 2103; AVX1-NEXT: vpand %xmm6, %xmm11, %xmm4 2104; AVX1-NEXT: vpackuswb %xmm8, %xmm4, %xmm8 2105; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 2106; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] 2107; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2108; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] 2109; AVX1-NEXT: vpmullw %xmm4, %xmm7, %xmm12 2110; AVX1-NEXT: vpand %xmm6, %xmm12, %xmm7 2111; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero 2112; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero 2113; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm13 2114; AVX1-NEXT: vpand %xmm6, %xmm13, %xmm2 2115; AVX1-NEXT: vpackuswb %xmm7, %xmm2, %xmm10 2116; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 2117; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 2118; AVX1-NEXT: vpmullw %xmm2, %xmm7, %xmm7 2119; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 2120; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2121; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2122; AVX1-NEXT: vpmullw %xmm4, %xmm0, %xmm0 2123; AVX1-NEXT: vpand %xmm6, %xmm0, %xmm4 2124; AVX1-NEXT: vpackuswb %xmm2, %xmm4, %xmm14 2125; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 2126; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] 2127; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 2128; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] 2129; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 2130; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero 2131; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero 2132; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm3 2133; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm1 2134; AVX1-NEXT: vpand %xmm6, %xmm3, %xmm4 2135; AVX1-NEXT: vpackuswb %xmm1, %xmm4, %xmm15 2136; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 2137; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 2138; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2 2139; AVX1-NEXT: vpsrlw $8, %xmm7, %xmm3 2140; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 2141; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 2142; AVX1-NEXT: vpsrlw $8, %xmm12, %xmm3 2143; AVX1-NEXT: vpsrlw $8, %xmm13, %xmm4 2144; AVX1-NEXT: vpackuswb %xmm3, %xmm4, %xmm3 2145; AVX1-NEXT: vpsrlw $8, %xmm9, %xmm4 2146; AVX1-NEXT: vpsrlw $8, %xmm11, %xmm6 2147; AVX1-NEXT: vpackuswb %xmm4, %xmm6, %xmm4 2148; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm2 2149; AVX1-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0 2150; AVX1-NEXT: vpcmpeqb %xmm5, %xmm3, %xmm3 2151; AVX1-NEXT: vpcmpeqb %xmm5, %xmm4, %xmm7 2152; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2153; AVX1-NEXT: vpxor %xmm1, %xmm2, %xmm6 2154; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm4 2155; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm5 2156; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm3 2157; AVX1-NEXT: vmovdqa %xmm15, 48(%rsi) 2158; AVX1-NEXT: vmovdqa %xmm14, 32(%rsi) 2159; AVX1-NEXT: vmovdqa %xmm10, 16(%rsi) 2160; AVX1-NEXT: vmovdqa %xmm8, (%rsi) 2161; AVX1-NEXT: vpmovsxbd %xmm6, %xmm0 2162; AVX1-NEXT: vmovdqa %xmm0, 192(%rdi) 2163; AVX1-NEXT: vpmovsxbd %xmm4, %xmm0 2164; AVX1-NEXT: vmovdqa %xmm0, 128(%rdi) 2165; AVX1-NEXT: vpmovsxbd %xmm5, %xmm0 2166; AVX1-NEXT: vmovdqa %xmm0, 64(%rdi) 2167; AVX1-NEXT: vpmovsxbd %xmm3, %xmm0 2168; AVX1-NEXT: vmovdqa %xmm0, (%rdi) 2169; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] 2170; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2171; AVX1-NEXT: vmovdqa %xmm0, 224(%rdi) 2172; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[3,3,3,3] 2173; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2174; AVX1-NEXT: vmovdqa %xmm0, 240(%rdi) 2175; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 2176; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2177; AVX1-NEXT: vmovdqa %xmm0, 208(%rdi) 2178; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,3,2,3] 2179; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2180; AVX1-NEXT: vmovdqa %xmm0, 160(%rdi) 2181; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] 2182; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2183; AVX1-NEXT: vmovdqa %xmm0, 176(%rdi) 2184; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 2185; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2186; AVX1-NEXT: vmovdqa %xmm0, 144(%rdi) 2187; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3] 2188; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2189; AVX1-NEXT: vmovdqa %xmm0, 96(%rdi) 2190; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,3,3,3] 2191; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2192; AVX1-NEXT: vmovdqa %xmm0, 112(%rdi) 2193; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 2194; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2195; AVX1-NEXT: vmovdqa %xmm0, 80(%rdi) 2196; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] 2197; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2198; AVX1-NEXT: vmovdqa %xmm0, 32(%rdi) 2199; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[3,3,3,3] 2200; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2201; AVX1-NEXT: vmovdqa %xmm0, 48(%rdi) 2202; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] 2203; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 2204; AVX1-NEXT: vmovdqa %xmm0, 16(%rdi) 2205; AVX1-NEXT: vzeroupper 2206; AVX1-NEXT: retq 2207; 2208; AVX2-LABEL: umulo_v64i8: 2209; AVX2: # %bb.0: 2210; AVX2-NEXT: movq %rdi, %rax 2211; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 2212; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] 2213; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31] 2214; AVX2-NEXT: vpmullw %ymm5, %ymm6, %ymm5 2215; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 2216; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm7 2217; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] 2218; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23] 2219; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm2 2220; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm0 2221; AVX2-NEXT: vpackuswb %ymm7, %ymm0, %ymm9 2222; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15],ymm3[24],ymm4[24],ymm3[25],ymm4[25],ymm3[26],ymm4[26],ymm3[27],ymm4[27],ymm3[28],ymm4[28],ymm3[29],ymm4[29],ymm3[30],ymm4[30],ymm3[31],ymm4[31] 2223; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15],ymm1[24],ymm4[24],ymm1[25],ymm4[25],ymm1[26],ymm4[26],ymm1[27],ymm4[27],ymm1[28],ymm4[28],ymm1[29],ymm4[29],ymm1[30],ymm4[30],ymm1[31],ymm4[31] 2224; AVX2-NEXT: vpmullw %ymm7, %ymm8, %ymm7 2225; AVX2-NEXT: vpand %ymm6, %ymm7, %ymm8 2226; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] 2227; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[16],ymm4[16],ymm1[17],ymm4[17],ymm1[18],ymm4[18],ymm1[19],ymm4[19],ymm1[20],ymm4[20],ymm1[21],ymm4[21],ymm1[22],ymm4[22],ymm1[23],ymm4[23] 2228; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 2229; AVX2-NEXT: vpand %ymm6, %ymm1, %ymm3 2230; AVX2-NEXT: vpackuswb %ymm8, %ymm3, %ymm8 2231; AVX2-NEXT: vpsrlw $8, %ymm7, %ymm6 2232; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 2233; AVX2-NEXT: vpackuswb %ymm6, %ymm1, %ymm1 2234; AVX2-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1 2235; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 2236; AVX2-NEXT: vpxor %ymm6, %ymm1, %ymm1 2237; AVX2-NEXT: vpsrlw $8, %ymm5, %ymm5 2238; AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 2239; AVX2-NEXT: vpackuswb %ymm5, %ymm2, %ymm2 2240; AVX2-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 2241; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 2242; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] 2243; AVX2-NEXT: vpmovsxbd %xmm4, %ymm4 2244; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 2245; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] 2246; AVX2-NEXT: vpmovsxbd %xmm6, %ymm6 2247; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] 2248; AVX2-NEXT: vpmovsxbd %xmm7, %ymm7 2249; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 2250; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] 2251; AVX2-NEXT: vpmovsxbd %xmm3, %ymm3 2252; AVX2-NEXT: vpmovsxbd %xmm2, %ymm2 2253; AVX2-NEXT: vpmovsxbd %xmm5, %ymm5 2254; AVX2-NEXT: vpmovsxbd %xmm1, %ymm1 2255; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 2256; AVX2-NEXT: vmovdqa %ymm8, 32(%rsi) 2257; AVX2-NEXT: vmovdqa %ymm9, (%rsi) 2258; AVX2-NEXT: vmovdqa %ymm0, 192(%rdi) 2259; AVX2-NEXT: vmovdqa %ymm1, 128(%rdi) 2260; AVX2-NEXT: vmovdqa %ymm5, 64(%rdi) 2261; AVX2-NEXT: vmovdqa %ymm2, (%rdi) 2262; AVX2-NEXT: vmovdqa %ymm3, 224(%rdi) 2263; AVX2-NEXT: vmovdqa %ymm7, 160(%rdi) 2264; AVX2-NEXT: vmovdqa %ymm6, 96(%rdi) 2265; AVX2-NEXT: vmovdqa %ymm4, 32(%rdi) 2266; AVX2-NEXT: vzeroupper 2267; AVX2-NEXT: retq 2268; 2269; AVX512F-LABEL: umulo_v64i8: 2270; AVX512F: # %bb.0: 2271; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 2272; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 2273; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 2274; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm5 2275; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm4 2276; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero 2277; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm4 2278; AVX512F-NEXT: vpsrlw $8, %ymm4, %ymm3 2279; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero 2280; AVX512F-NEXT: vptestmd %zmm3, %zmm3, %k1 2281; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 2282; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero 2283; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm5 2284; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm2 2285; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 2286; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k2 2287; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 2288; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero 2289; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 2290; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero 2291; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm6 2292; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm2 2293; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero 2294; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k3 2295; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero 2296; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero 2297; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm7 2298; AVX512F-NEXT: vpsrlw $8, %ymm7, %ymm0 2299; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero 2300; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k4 2301; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z} 2302; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} 2303; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} 2304; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} 2305; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero 2306; AVX512F-NEXT: vpmovdb %zmm4, 48(%rdi) 2307; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm5[0],zero,ymm5[1],zero,ymm5[2],zero,ymm5[3],zero,ymm5[4],zero,ymm5[5],zero,ymm5[6],zero,ymm5[7],zero,ymm5[8],zero,ymm5[9],zero,ymm5[10],zero,ymm5[11],zero,ymm5[12],zero,ymm5[13],zero,ymm5[14],zero,ymm5[15],zero 2308; AVX512F-NEXT: vpmovdb %zmm4, 32(%rdi) 2309; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm6[0],zero,ymm6[1],zero,ymm6[2],zero,ymm6[3],zero,ymm6[4],zero,ymm6[5],zero,ymm6[6],zero,ymm6[7],zero,ymm6[8],zero,ymm6[9],zero,ymm6[10],zero,ymm6[11],zero,ymm6[12],zero,ymm6[13],zero,ymm6[14],zero,ymm6[15],zero 2310; AVX512F-NEXT: vpmovdb %zmm4, 16(%rdi) 2311; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm7[0],zero,ymm7[1],zero,ymm7[2],zero,ymm7[3],zero,ymm7[4],zero,ymm7[5],zero,ymm7[6],zero,ymm7[7],zero,ymm7[8],zero,ymm7[9],zero,ymm7[10],zero,ymm7[11],zero,ymm7[12],zero,ymm7[13],zero,ymm7[14],zero,ymm7[15],zero 2312; AVX512F-NEXT: vpmovdb %zmm4, (%rdi) 2313; AVX512F-NEXT: retq 2314; 2315; AVX512BW-LABEL: umulo_v64i8: 2316; AVX512BW: # %bb.0: 2317; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 2318; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63] 2319; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm0[8],zmm2[8],zmm0[9],zmm2[9],zmm0[10],zmm2[10],zmm0[11],zmm2[11],zmm0[12],zmm2[12],zmm0[13],zmm2[13],zmm0[14],zmm2[14],zmm0[15],zmm2[15],zmm0[24],zmm2[24],zmm0[25],zmm2[25],zmm0[26],zmm2[26],zmm0[27],zmm2[27],zmm0[28],zmm2[28],zmm0[29],zmm2[29],zmm0[30],zmm2[30],zmm0[31],zmm2[31],zmm0[40],zmm2[40],zmm0[41],zmm2[41],zmm0[42],zmm2[42],zmm0[43],zmm2[43],zmm0[44],zmm2[44],zmm0[45],zmm2[45],zmm0[46],zmm2[46],zmm0[47],zmm2[47],zmm0[56],zmm2[56],zmm0[57],zmm2[57],zmm0[58],zmm2[58],zmm0[59],zmm2[59],zmm0[60],zmm2[60],zmm0[61],zmm2[61],zmm0[62],zmm2[62],zmm0[63],zmm2[63] 2320; AVX512BW-NEXT: vpmullw %zmm3, %zmm4, %zmm3 2321; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] 2322; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm5 2323; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55] 2324; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm2[0],zmm0[1],zmm2[1],zmm0[2],zmm2[2],zmm0[3],zmm2[3],zmm0[4],zmm2[4],zmm0[5],zmm2[5],zmm0[6],zmm2[6],zmm0[7],zmm2[7],zmm0[16],zmm2[16],zmm0[17],zmm2[17],zmm0[18],zmm2[18],zmm0[19],zmm2[19],zmm0[20],zmm2[20],zmm0[21],zmm2[21],zmm0[22],zmm2[22],zmm0[23],zmm2[23],zmm0[32],zmm2[32],zmm0[33],zmm2[33],zmm0[34],zmm2[34],zmm0[35],zmm2[35],zmm0[36],zmm2[36],zmm0[37],zmm2[37],zmm0[38],zmm2[38],zmm0[39],zmm2[39],zmm0[48],zmm2[48],zmm0[49],zmm2[49],zmm0[50],zmm2[50],zmm0[51],zmm2[51],zmm0[52],zmm2[52],zmm0[53],zmm2[53],zmm0[54],zmm2[54],zmm0[55],zmm2[55] 2325; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 2326; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm1 2327; AVX512BW-NEXT: vpackuswb %zmm5, %zmm1, %zmm4 2328; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm1 2329; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 2330; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0 2331; AVX512BW-NEXT: vptestmb %zmm0, %zmm0, %k1 2332; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} 2333; AVX512BW-NEXT: kshiftrd $16, %k1, %k2 2334; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} 2335; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 2336; AVX512BW-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z} 2337; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 2338; AVX512BW-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} 2339; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rdi) 2340; AVX512BW-NEXT: retq 2341 %t = call {<64 x i8>, <64 x i1>} @llvm.umul.with.overflow.v64i8(<64 x i8> %a0, <64 x i8> %a1) 2342 %val = extractvalue {<64 x i8>, <64 x i1>} %t, 0 2343 %obit = extractvalue {<64 x i8>, <64 x i1>} %t, 1 2344 %res = sext <64 x i1> %obit to <64 x i32> 2345 store <64 x i8> %val, ptr %p2 2346 ret <64 x i32> %res 2347} 2348 2349define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, ptr %p2) nounwind { 2350; SSE2-LABEL: umulo_v8i16: 2351; SSE2: # %bb.0: 2352; SSE2-NEXT: movdqa %xmm0, %xmm2 2353; SSE2-NEXT: pmullw %xmm1, %xmm2 2354; SSE2-NEXT: pmulhuw %xmm0, %xmm1 2355; SSE2-NEXT: pxor %xmm0, %xmm0 2356; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 2357; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 2358; SSE2-NEXT: pxor %xmm0, %xmm1 2359; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2360; SSE2-NEXT: psrad $16, %xmm0 2361; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2362; SSE2-NEXT: pslld $31, %xmm1 2363; SSE2-NEXT: psrad $31, %xmm1 2364; SSE2-NEXT: movdqa %xmm2, (%rdi) 2365; SSE2-NEXT: retq 2366; 2367; SSSE3-LABEL: umulo_v8i16: 2368; SSSE3: # %bb.0: 2369; SSSE3-NEXT: movdqa %xmm0, %xmm2 2370; SSSE3-NEXT: pmullw %xmm1, %xmm2 2371; SSSE3-NEXT: pmulhuw %xmm0, %xmm1 2372; SSSE3-NEXT: pxor %xmm0, %xmm0 2373; SSSE3-NEXT: pcmpeqw %xmm0, %xmm1 2374; SSSE3-NEXT: pcmpeqd %xmm0, %xmm0 2375; SSSE3-NEXT: pxor %xmm0, %xmm1 2376; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 2377; SSSE3-NEXT: psrad $16, %xmm0 2378; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2379; SSSE3-NEXT: pslld $31, %xmm1 2380; SSSE3-NEXT: psrad $31, %xmm1 2381; SSSE3-NEXT: movdqa %xmm2, (%rdi) 2382; SSSE3-NEXT: retq 2383; 2384; SSE41-LABEL: umulo_v8i16: 2385; SSE41: # %bb.0: 2386; SSE41-NEXT: movdqa %xmm0, %xmm2 2387; SSE41-NEXT: pmullw %xmm1, %xmm2 2388; SSE41-NEXT: pmulhuw %xmm0, %xmm1 2389; SSE41-NEXT: pxor %xmm0, %xmm0 2390; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 2391; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 2392; SSE41-NEXT: pxor %xmm0, %xmm1 2393; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 2394; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] 2395; SSE41-NEXT: pslld $31, %xmm1 2396; SSE41-NEXT: psrad $31, %xmm1 2397; SSE41-NEXT: movdqa %xmm2, (%rdi) 2398; SSE41-NEXT: retq 2399; 2400; AVX1-LABEL: umulo_v8i16: 2401; AVX1: # %bb.0: 2402; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2403; AVX1-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2404; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 2405; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 2406; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2407; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 2408; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 2409; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 2410; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 2411; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2412; AVX1-NEXT: vmovdqa %xmm2, (%rdi) 2413; AVX1-NEXT: retq 2414; 2415; AVX2-LABEL: umulo_v8i16: 2416; AVX2: # %bb.0: 2417; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2418; AVX2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2419; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 2420; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 2421; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 2422; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 2423; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 2424; AVX2-NEXT: vmovdqa %xmm2, (%rdi) 2425; AVX2-NEXT: retq 2426; 2427; AVX512F-LABEL: umulo_v8i16: 2428; AVX512F: # %bb.0: 2429; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2430; AVX512F-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2431; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 2432; AVX512F-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 2433; AVX512F-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 2434; AVX512F-NEXT: vpmovsxwd %xmm0, %ymm0 2435; AVX512F-NEXT: vptestmd %ymm0, %ymm0, %k1 2436; AVX512F-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2437; AVX512F-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2438; AVX512F-NEXT: vmovdqa %xmm2, (%rdi) 2439; AVX512F-NEXT: retq 2440; 2441; AVX512BW-LABEL: umulo_v8i16: 2442; AVX512BW: # %bb.0: 2443; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm2 2444; AVX512BW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 2445; AVX512BW-NEXT: vptestmw %xmm0, %xmm0, %k1 2446; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 2447; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} 2448; AVX512BW-NEXT: vmovdqa %xmm2, (%rdi) 2449; AVX512BW-NEXT: retq 2450 %t = call {<8 x i16>, <8 x i1>} @llvm.umul.with.overflow.v8i16(<8 x i16> %a0, <8 x i16> %a1) 2451 %val = extractvalue {<8 x i16>, <8 x i1>} %t, 0 2452 %obit = extractvalue {<8 x i16>, <8 x i1>} %t, 1 2453 %res = sext <8 x i1> %obit to <8 x i32> 2454 store <8 x i16> %val, ptr %p2 2455 ret <8 x i32> %res 2456} 2457 2458define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, ptr %p2) nounwind { 2459; SSE2-LABEL: umulo_v2i64: 2460; SSE2: # %bb.0: 2461; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2462; SSE2-NEXT: movq %xmm2, %r8 2463; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 2464; SSE2-NEXT: movq %xmm2, %r10 2465; SSE2-NEXT: movq %xmm0, %rax 2466; SSE2-NEXT: movq %xmm1, %rdx 2467; SSE2-NEXT: xorl %ecx, %ecx 2468; SSE2-NEXT: mulq %rdx 2469; SSE2-NEXT: movq $-1, %r9 2470; SSE2-NEXT: movl $0, %esi 2471; SSE2-NEXT: cmovoq %r9, %rsi 2472; SSE2-NEXT: movq %rax, %xmm1 2473; SSE2-NEXT: movq %r8, %rax 2474; SSE2-NEXT: mulq %r10 2475; SSE2-NEXT: movq %rax, %xmm0 2476; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2477; SSE2-NEXT: movq %rsi, %xmm0 2478; SSE2-NEXT: cmovoq %r9, %rcx 2479; SSE2-NEXT: movq %rcx, %xmm2 2480; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2481; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2482; SSE2-NEXT: movdqa %xmm1, (%rdi) 2483; SSE2-NEXT: retq 2484; 2485; SSSE3-LABEL: umulo_v2i64: 2486; SSSE3: # %bb.0: 2487; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] 2488; SSSE3-NEXT: movq %xmm2, %r8 2489; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] 2490; SSSE3-NEXT: movq %xmm2, %r10 2491; SSSE3-NEXT: movq %xmm0, %rax 2492; SSSE3-NEXT: movq %xmm1, %rdx 2493; SSSE3-NEXT: xorl %ecx, %ecx 2494; SSSE3-NEXT: mulq %rdx 2495; SSSE3-NEXT: movq $-1, %r9 2496; SSSE3-NEXT: movl $0, %esi 2497; SSSE3-NEXT: cmovoq %r9, %rsi 2498; SSSE3-NEXT: movq %rax, %xmm1 2499; SSSE3-NEXT: movq %r8, %rax 2500; SSSE3-NEXT: mulq %r10 2501; SSSE3-NEXT: movq %rax, %xmm0 2502; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2503; SSSE3-NEXT: movq %rsi, %xmm0 2504; SSSE3-NEXT: cmovoq %r9, %rcx 2505; SSSE3-NEXT: movq %rcx, %xmm2 2506; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] 2507; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2508; SSSE3-NEXT: movdqa %xmm1, (%rdi) 2509; SSSE3-NEXT: retq 2510; 2511; SSE41-LABEL: umulo_v2i64: 2512; SSE41: # %bb.0: 2513; SSE41-NEXT: movq %xmm0, %r10 2514; SSE41-NEXT: movq %xmm1, %r8 2515; SSE41-NEXT: pextrq $1, %xmm0, %rax 2516; SSE41-NEXT: pextrq $1, %xmm1, %rdx 2517; SSE41-NEXT: xorl %esi, %esi 2518; SSE41-NEXT: mulq %rdx 2519; SSE41-NEXT: movq $-1, %r9 2520; SSE41-NEXT: movl $0, %ecx 2521; SSE41-NEXT: cmovoq %r9, %rcx 2522; SSE41-NEXT: movq %rax, %xmm0 2523; SSE41-NEXT: movq %r10, %rax 2524; SSE41-NEXT: mulq %r8 2525; SSE41-NEXT: movq %rax, %xmm1 2526; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2527; SSE41-NEXT: movq %rcx, %xmm0 2528; SSE41-NEXT: cmovoq %r9, %rsi 2529; SSE41-NEXT: movq %rsi, %xmm2 2530; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] 2531; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2532; SSE41-NEXT: movdqa %xmm1, (%rdi) 2533; SSE41-NEXT: retq 2534; 2535; AVX-LABEL: umulo_v2i64: 2536; AVX: # %bb.0: 2537; AVX-NEXT: vmovq %xmm0, %r10 2538; AVX-NEXT: vmovq %xmm1, %r8 2539; AVX-NEXT: vpextrq $1, %xmm0, %rax 2540; AVX-NEXT: vpextrq $1, %xmm1, %rdx 2541; AVX-NEXT: xorl %esi, %esi 2542; AVX-NEXT: mulq %rdx 2543; AVX-NEXT: movq $-1, %r9 2544; AVX-NEXT: movl $0, %ecx 2545; AVX-NEXT: cmovoq %r9, %rcx 2546; AVX-NEXT: vmovq %rax, %xmm0 2547; AVX-NEXT: movq %r10, %rax 2548; AVX-NEXT: mulq %r8 2549; AVX-NEXT: vmovq %rax, %xmm1 2550; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2551; AVX-NEXT: vmovq %rcx, %xmm0 2552; AVX-NEXT: cmovoq %r9, %rsi 2553; AVX-NEXT: vmovq %rsi, %xmm2 2554; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 2555; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2556; AVX-NEXT: vmovdqa %xmm1, (%rdi) 2557; AVX-NEXT: retq 2558; 2559; AVX512F-LABEL: umulo_v2i64: 2560; AVX512F: # %bb.0: 2561; AVX512F-NEXT: vmovq %xmm0, %rcx 2562; AVX512F-NEXT: vmovq %xmm1, %rsi 2563; AVX512F-NEXT: vpextrq $1, %xmm0, %rax 2564; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx 2565; AVX512F-NEXT: mulq %rdx 2566; AVX512F-NEXT: seto %r8b 2567; AVX512F-NEXT: vmovq %rax, %xmm0 2568; AVX512F-NEXT: movq %rcx, %rax 2569; AVX512F-NEXT: mulq %rsi 2570; AVX512F-NEXT: vmovq %rax, %xmm1 2571; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2572; AVX512F-NEXT: seto %al 2573; AVX512F-NEXT: andl $1, %eax 2574; AVX512F-NEXT: kmovw %eax, %k0 2575; AVX512F-NEXT: kmovw %r8d, %k1 2576; AVX512F-NEXT: kshiftlw $15, %k1, %k1 2577; AVX512F-NEXT: kshiftrw $14, %k1, %k1 2578; AVX512F-NEXT: korw %k1, %k0, %k1 2579; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2580; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 2581; AVX512F-NEXT: vmovdqa %xmm1, (%rdi) 2582; AVX512F-NEXT: retq 2583; 2584; AVX512BW-LABEL: umulo_v2i64: 2585; AVX512BW: # %bb.0: 2586; AVX512BW-NEXT: vmovq %xmm0, %rcx 2587; AVX512BW-NEXT: vmovq %xmm1, %rsi 2588; AVX512BW-NEXT: vpextrq $1, %xmm0, %rax 2589; AVX512BW-NEXT: vpextrq $1, %xmm1, %rdx 2590; AVX512BW-NEXT: mulq %rdx 2591; AVX512BW-NEXT: seto %r8b 2592; AVX512BW-NEXT: vmovq %rax, %xmm0 2593; AVX512BW-NEXT: movq %rcx, %rax 2594; AVX512BW-NEXT: mulq %rsi 2595; AVX512BW-NEXT: vmovq %rax, %xmm1 2596; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] 2597; AVX512BW-NEXT: seto %al 2598; AVX512BW-NEXT: andl $1, %eax 2599; AVX512BW-NEXT: kmovw %eax, %k0 2600; AVX512BW-NEXT: kmovd %r8d, %k1 2601; AVX512BW-NEXT: kshiftlw $15, %k1, %k1 2602; AVX512BW-NEXT: kshiftrw $14, %k1, %k1 2603; AVX512BW-NEXT: korw %k1, %k0, %k1 2604; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2605; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 2606; AVX512BW-NEXT: vmovdqa %xmm1, (%rdi) 2607; AVX512BW-NEXT: retq 2608 %t = call {<2 x i64>, <2 x i1>} @llvm.umul.with.overflow.v2i64(<2 x i64> %a0, <2 x i64> %a1) 2609 %val = extractvalue {<2 x i64>, <2 x i1>} %t, 0 2610 %obit = extractvalue {<2 x i64>, <2 x i1>} %t, 1 2611 %res = sext <2 x i1> %obit to <2 x i32> 2612 store <2 x i64> %val, ptr %p2 2613 ret <2 x i32> %res 2614} 2615 2616define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind { 2617; SSE2-LABEL: umulo_v4i24: 2618; SSE2: # %bb.0: 2619; SSE2-NEXT: movdqa %xmm0, %xmm2 2620; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 2621; SSE2-NEXT: pand %xmm0, %xmm1 2622; SSE2-NEXT: pand %xmm0, %xmm2 2623; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] 2624; SSE2-NEXT: pmuludq %xmm1, %xmm2 2625; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] 2626; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2627; SSE2-NEXT: pmuludq %xmm0, %xmm1 2628; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 2629; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2630; SSE2-NEXT: pxor %xmm4, %xmm4 2631; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 2632; SSE2-NEXT: pcmpeqd %xmm5, %xmm5 2633; SSE2-NEXT: pxor %xmm3, %xmm5 2634; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2635; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] 2636; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2637; SSE2-NEXT: psrld $24, %xmm0 2638; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 2639; SSE2-NEXT: por %xmm5, %xmm0 2640; SSE2-NEXT: movd %xmm2, %eax 2641; SSE2-NEXT: movw %ax, (%rdi) 2642; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2643; SSE2-NEXT: movd %xmm2, %ecx 2644; SSE2-NEXT: movw %cx, 6(%rdi) 2645; SSE2-NEXT: movd %xmm1, %edx 2646; SSE2-NEXT: movw %dx, 3(%rdi) 2647; SSE2-NEXT: shrl $16, %eax 2648; SSE2-NEXT: movb %al, 2(%rdi) 2649; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 2650; SSE2-NEXT: movd %xmm1, %eax 2651; SSE2-NEXT: movw %ax, 9(%rdi) 2652; SSE2-NEXT: shrl $16, %ecx 2653; SSE2-NEXT: movb %cl, 8(%rdi) 2654; SSE2-NEXT: shrl $16, %edx 2655; SSE2-NEXT: movb %dl, 5(%rdi) 2656; SSE2-NEXT: shrl $16, %eax 2657; SSE2-NEXT: movb %al, 11(%rdi) 2658; SSE2-NEXT: retq 2659; 2660; SSSE3-LABEL: umulo_v4i24: 2661; SSSE3: # %bb.0: 2662; SSSE3-NEXT: movdqa %xmm0, %xmm2 2663; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 2664; SSSE3-NEXT: pand %xmm0, %xmm1 2665; SSSE3-NEXT: pand %xmm0, %xmm2 2666; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] 2667; SSSE3-NEXT: pmuludq %xmm1, %xmm2 2668; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,3,2,3] 2669; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] 2670; SSSE3-NEXT: pmuludq %xmm0, %xmm1 2671; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3] 2672; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2673; SSSE3-NEXT: pxor %xmm4, %xmm4 2674; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 2675; SSSE3-NEXT: pcmpeqd %xmm5, %xmm5 2676; SSSE3-NEXT: pxor %xmm3, %xmm5 2677; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] 2678; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] 2679; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 2680; SSSE3-NEXT: psrld $24, %xmm0 2681; SSSE3-NEXT: pcmpgtd %xmm4, %xmm0 2682; SSSE3-NEXT: por %xmm5, %xmm0 2683; SSSE3-NEXT: movd %xmm2, %eax 2684; SSSE3-NEXT: movw %ax, (%rdi) 2685; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] 2686; SSSE3-NEXT: movd %xmm2, %ecx 2687; SSSE3-NEXT: movw %cx, 6(%rdi) 2688; SSSE3-NEXT: movd %xmm1, %edx 2689; SSSE3-NEXT: movw %dx, 3(%rdi) 2690; SSSE3-NEXT: shrl $16, %eax 2691; SSSE3-NEXT: movb %al, 2(%rdi) 2692; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] 2693; SSSE3-NEXT: movd %xmm1, %eax 2694; SSSE3-NEXT: movw %ax, 9(%rdi) 2695; SSSE3-NEXT: shrl $16, %ecx 2696; SSSE3-NEXT: movb %cl, 8(%rdi) 2697; SSSE3-NEXT: shrl $16, %edx 2698; SSSE3-NEXT: movb %dl, 5(%rdi) 2699; SSSE3-NEXT: shrl $16, %eax 2700; SSSE3-NEXT: movb %al, 11(%rdi) 2701; SSSE3-NEXT: retq 2702; 2703; SSE41-LABEL: umulo_v4i24: 2704; SSE41: # %bb.0: 2705; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] 2706; SSE41-NEXT: pand %xmm2, %xmm0 2707; SSE41-NEXT: pand %xmm2, %xmm1 2708; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 2709; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 2710; SSE41-NEXT: pmuludq %xmm2, %xmm3 2711; SSE41-NEXT: movdqa %xmm0, %xmm2 2712; SSE41-NEXT: pmuludq %xmm1, %xmm2 2713; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] 2714; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 2715; SSE41-NEXT: pxor %xmm3, %xmm3 2716; SSE41-NEXT: pcmpeqd %xmm3, %xmm2 2717; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 2718; SSE41-NEXT: pxor %xmm2, %xmm4 2719; SSE41-NEXT: pmulld %xmm0, %xmm1 2720; SSE41-NEXT: pextrd $3, %xmm1, %eax 2721; SSE41-NEXT: pextrd $2, %xmm1, %ecx 2722; SSE41-NEXT: pextrd $1, %xmm1, %edx 2723; SSE41-NEXT: movd %xmm1, %esi 2724; SSE41-NEXT: movdqa %xmm1, %xmm0 2725; SSE41-NEXT: psrld $24, %xmm0 2726; SSE41-NEXT: pcmpgtd %xmm3, %xmm0 2727; SSE41-NEXT: por %xmm4, %xmm0 2728; SSE41-NEXT: movw %ax, 9(%rdi) 2729; SSE41-NEXT: movw %cx, 6(%rdi) 2730; SSE41-NEXT: movw %dx, 3(%rdi) 2731; SSE41-NEXT: movw %si, (%rdi) 2732; SSE41-NEXT: shrl $16, %eax 2733; SSE41-NEXT: movb %al, 11(%rdi) 2734; SSE41-NEXT: shrl $16, %ecx 2735; SSE41-NEXT: movb %cl, 8(%rdi) 2736; SSE41-NEXT: shrl $16, %edx 2737; SSE41-NEXT: movb %dl, 5(%rdi) 2738; SSE41-NEXT: shrl $16, %esi 2739; SSE41-NEXT: movb %sil, 2(%rdi) 2740; SSE41-NEXT: retq 2741; 2742; AVX1-LABEL: umulo_v4i24: 2743; AVX1: # %bb.0: 2744; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] 2745; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 2746; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 2747; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,1,3,3] 2748; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,1,3,3] 2749; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 2750; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 2751; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2752; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] 2753; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 2754; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 2755; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 2756; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 2757; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm1 2758; AVX1-NEXT: vpsrld $24, %xmm1, %xmm0 2759; AVX1-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0 2760; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 2761; AVX1-NEXT: vpextrd $3, %xmm1, %eax 2762; AVX1-NEXT: movw %ax, 9(%rdi) 2763; AVX1-NEXT: vpextrd $2, %xmm1, %ecx 2764; AVX1-NEXT: movw %cx, 6(%rdi) 2765; AVX1-NEXT: vpextrd $1, %xmm1, %edx 2766; AVX1-NEXT: movw %dx, 3(%rdi) 2767; AVX1-NEXT: vmovd %xmm1, %esi 2768; AVX1-NEXT: movw %si, (%rdi) 2769; AVX1-NEXT: shrl $16, %eax 2770; AVX1-NEXT: movb %al, 11(%rdi) 2771; AVX1-NEXT: shrl $16, %ecx 2772; AVX1-NEXT: movb %cl, 8(%rdi) 2773; AVX1-NEXT: shrl $16, %edx 2774; AVX1-NEXT: movb %dl, 5(%rdi) 2775; AVX1-NEXT: shrl $16, %esi 2776; AVX1-NEXT: movb %sil, 2(%rdi) 2777; AVX1-NEXT: retq 2778; 2779; AVX2-LABEL: umulo_v4i24: 2780; AVX2: # %bb.0: 2781; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] 2782; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 2783; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 2784; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] 2785; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] 2786; AVX2-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 2787; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 2788; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] 2789; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 2790; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 2791; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 2792; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 2793; AVX2-NEXT: vpxor %xmm4, %xmm2, %xmm2 2794; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm1 2795; AVX2-NEXT: vpsrld $24, %xmm1, %xmm0 2796; AVX2-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0 2797; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 2798; AVX2-NEXT: vpextrd $3, %xmm1, %eax 2799; AVX2-NEXT: movw %ax, 9(%rdi) 2800; AVX2-NEXT: vpextrd $2, %xmm1, %ecx 2801; AVX2-NEXT: movw %cx, 6(%rdi) 2802; AVX2-NEXT: vpextrd $1, %xmm1, %edx 2803; AVX2-NEXT: movw %dx, 3(%rdi) 2804; AVX2-NEXT: vmovd %xmm1, %esi 2805; AVX2-NEXT: movw %si, (%rdi) 2806; AVX2-NEXT: shrl $16, %eax 2807; AVX2-NEXT: movb %al, 11(%rdi) 2808; AVX2-NEXT: shrl $16, %ecx 2809; AVX2-NEXT: movb %cl, 8(%rdi) 2810; AVX2-NEXT: shrl $16, %edx 2811; AVX2-NEXT: movb %dl, 5(%rdi) 2812; AVX2-NEXT: shrl $16, %esi 2813; AVX2-NEXT: movb %sil, 2(%rdi) 2814; AVX2-NEXT: retq 2815; 2816; AVX512-LABEL: umulo_v4i24: 2817; AVX512: # %bb.0: 2818; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] 2819; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 2820; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 2821; AVX512-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 2822; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] 2823; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] 2824; AVX512-NEXT: vpmuludq %xmm3, %xmm4, %xmm3 2825; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,3,7] 2826; AVX512-NEXT: vpermi2d %xmm3, %xmm2, %xmm4 2827; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm1 2828; AVX512-NEXT: vpsrld $24, %xmm1, %xmm0 2829; AVX512-NEXT: vpor %xmm4, %xmm0, %xmm0 2830; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 2831; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 2832; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 2833; AVX512-NEXT: vpextrd $3, %xmm1, %eax 2834; AVX512-NEXT: movw %ax, 9(%rdi) 2835; AVX512-NEXT: vpextrd $2, %xmm1, %ecx 2836; AVX512-NEXT: movw %cx, 6(%rdi) 2837; AVX512-NEXT: vpextrd $1, %xmm1, %edx 2838; AVX512-NEXT: movw %dx, 3(%rdi) 2839; AVX512-NEXT: vmovd %xmm1, %esi 2840; AVX512-NEXT: movw %si, (%rdi) 2841; AVX512-NEXT: shrl $16, %eax 2842; AVX512-NEXT: movb %al, 11(%rdi) 2843; AVX512-NEXT: shrl $16, %ecx 2844; AVX512-NEXT: movb %cl, 8(%rdi) 2845; AVX512-NEXT: shrl $16, %edx 2846; AVX512-NEXT: movb %dl, 5(%rdi) 2847; AVX512-NEXT: shrl $16, %esi 2848; AVX512-NEXT: movb %sil, 2(%rdi) 2849; AVX512-NEXT: retq 2850 %t = call {<4 x i24>, <4 x i1>} @llvm.umul.with.overflow.v4i24(<4 x i24> %a0, <4 x i24> %a1) 2851 %val = extractvalue {<4 x i24>, <4 x i1>} %t, 0 2852 %obit = extractvalue {<4 x i24>, <4 x i1>} %t, 1 2853 %res = sext <4 x i1> %obit to <4 x i32> 2854 store <4 x i24> %val, ptr %p2 2855 ret <4 x i32> %res 2856} 2857 2858define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { 2859; SSE-LABEL: umulo_v4i1: 2860; SSE: # %bb.0: 2861; SSE-NEXT: pand %xmm1, %xmm0 2862; SSE-NEXT: pslld $31, %xmm0 2863; SSE-NEXT: movmskps %xmm0, %eax 2864; SSE-NEXT: movb %al, (%rdi) 2865; SSE-NEXT: xorps %xmm0, %xmm0 2866; SSE-NEXT: retq 2867; 2868; AVX-LABEL: umulo_v4i1: 2869; AVX: # %bb.0: 2870; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 2871; AVX-NEXT: vpslld $31, %xmm0, %xmm0 2872; AVX-NEXT: vmovmskps %xmm0, %eax 2873; AVX-NEXT: movb %al, (%rdi) 2874; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 2875; AVX-NEXT: retq 2876; 2877; AVX512F-LABEL: umulo_v4i1: 2878; AVX512F: # %bb.0: 2879; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0 2880; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 2881; AVX512F-NEXT: vptestmd %xmm0, %xmm0, %k0 2882; AVX512F-NEXT: kmovw %k0, %eax 2883; AVX512F-NEXT: movb %al, (%rdi) 2884; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 2885; AVX512F-NEXT: retq 2886; 2887; AVX512BW-LABEL: umulo_v4i1: 2888; AVX512BW: # %bb.0: 2889; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm0 2890; AVX512BW-NEXT: vpslld $31, %xmm0, %xmm0 2891; AVX512BW-NEXT: vptestmd %xmm0, %xmm0, %k0 2892; AVX512BW-NEXT: kmovd %k0, %eax 2893; AVX512BW-NEXT: movb %al, (%rdi) 2894; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0 2895; AVX512BW-NEXT: retq 2896 %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) 2897 %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 2898 %obit = extractvalue {<4 x i1>, <4 x i1>} %t, 1 2899 %res = sext <4 x i1> %obit to <4 x i32> 2900 store <4 x i1> %val, ptr %p2 2901 ret <4 x i32> %res 2902} 2903 2904define <2 x i32> @umulo_v2i128(<2 x i128> %a0, <2 x i128> %a1, ptr %p2) nounwind { 2905; SSE2-LABEL: umulo_v2i128: 2906; SSE2: # %bb.0: 2907; SSE2-NEXT: pushq %rbp 2908; SSE2-NEXT: pushq %r15 2909; SSE2-NEXT: pushq %r14 2910; SSE2-NEXT: pushq %r13 2911; SSE2-NEXT: pushq %r12 2912; SSE2-NEXT: pushq %rbx 2913; SSE2-NEXT: movq %r9, %r10 2914; SSE2-NEXT: movq %rcx, %r12 2915; SSE2-NEXT: movq %rdx, %r11 2916; SSE2-NEXT: movq %rsi, %rax 2917; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 2918; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 2919; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r9 2920; SSE2-NEXT: testq %r10, %r10 2921; SSE2-NEXT: setne %dl 2922; SSE2-NEXT: testq %rsi, %rsi 2923; SSE2-NEXT: setne %bpl 2924; SSE2-NEXT: andb %dl, %bpl 2925; SSE2-NEXT: mulq %r8 2926; SSE2-NEXT: movq %rax, %rsi 2927; SSE2-NEXT: seto %bl 2928; SSE2-NEXT: movq %r10, %rax 2929; SSE2-NEXT: mulq %rdi 2930; SSE2-NEXT: seto %cl 2931; SSE2-NEXT: orb %bl, %cl 2932; SSE2-NEXT: leaq (%rsi,%rax), %rbx 2933; SSE2-NEXT: movq %rdi, %rax 2934; SSE2-NEXT: mulq %r8 2935; SSE2-NEXT: movq %rax, %rdi 2936; SSE2-NEXT: movq %rdx, %rsi 2937; SSE2-NEXT: addq %rbx, %rsi 2938; SSE2-NEXT: setb %r13b 2939; SSE2-NEXT: orb %cl, %r13b 2940; SSE2-NEXT: orb %bpl, %r13b 2941; SSE2-NEXT: testq %r9, %r9 2942; SSE2-NEXT: setne %al 2943; SSE2-NEXT: testq %r12, %r12 2944; SSE2-NEXT: setne %r10b 2945; SSE2-NEXT: andb %al, %r10b 2946; SSE2-NEXT: movq %r12, %rax 2947; SSE2-NEXT: mulq %r14 2948; SSE2-NEXT: movq %rax, %rbp 2949; SSE2-NEXT: seto %r8b 2950; SSE2-NEXT: movq %r9, %rax 2951; SSE2-NEXT: mulq %r11 2952; SSE2-NEXT: seto %cl 2953; SSE2-NEXT: orb %r8b, %cl 2954; SSE2-NEXT: addq %rax, %rbp 2955; SSE2-NEXT: movq %r11, %rax 2956; SSE2-NEXT: mulq %r14 2957; SSE2-NEXT: addq %rbp, %rdx 2958; SSE2-NEXT: setb %bl 2959; SSE2-NEXT: orb %cl, %bl 2960; SSE2-NEXT: orb %r10b, %bl 2961; SSE2-NEXT: movzbl %bl, %ecx 2962; SSE2-NEXT: negl %ecx 2963; SSE2-NEXT: movd %ecx, %xmm1 2964; SSE2-NEXT: movzbl %r13b, %ecx 2965; SSE2-NEXT: negl %ecx 2966; SSE2-NEXT: movd %ecx, %xmm0 2967; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2968; SSE2-NEXT: movq %rax, 16(%r15) 2969; SSE2-NEXT: movq %rdi, (%r15) 2970; SSE2-NEXT: movq %rdx, 24(%r15) 2971; SSE2-NEXT: movq %rsi, 8(%r15) 2972; SSE2-NEXT: popq %rbx 2973; SSE2-NEXT: popq %r12 2974; SSE2-NEXT: popq %r13 2975; SSE2-NEXT: popq %r14 2976; SSE2-NEXT: popq %r15 2977; SSE2-NEXT: popq %rbp 2978; SSE2-NEXT: retq 2979; 2980; SSSE3-LABEL: umulo_v2i128: 2981; SSSE3: # %bb.0: 2982; SSSE3-NEXT: pushq %rbp 2983; SSSE3-NEXT: pushq %r15 2984; SSSE3-NEXT: pushq %r14 2985; SSSE3-NEXT: pushq %r13 2986; SSSE3-NEXT: pushq %r12 2987; SSSE3-NEXT: pushq %rbx 2988; SSSE3-NEXT: movq %r9, %r10 2989; SSSE3-NEXT: movq %rcx, %r12 2990; SSSE3-NEXT: movq %rdx, %r11 2991; SSSE3-NEXT: movq %rsi, %rax 2992; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r15 2993; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r14 2994; SSSE3-NEXT: movq {{[0-9]+}}(%rsp), %r9 2995; SSSE3-NEXT: testq %r10, %r10 2996; SSSE3-NEXT: setne %dl 2997; SSSE3-NEXT: testq %rsi, %rsi 2998; SSSE3-NEXT: setne %bpl 2999; SSSE3-NEXT: andb %dl, %bpl 3000; SSSE3-NEXT: mulq %r8 3001; SSSE3-NEXT: movq %rax, %rsi 3002; SSSE3-NEXT: seto %bl 3003; SSSE3-NEXT: movq %r10, %rax 3004; SSSE3-NEXT: mulq %rdi 3005; SSSE3-NEXT: seto %cl 3006; SSSE3-NEXT: orb %bl, %cl 3007; SSSE3-NEXT: leaq (%rsi,%rax), %rbx 3008; SSSE3-NEXT: movq %rdi, %rax 3009; SSSE3-NEXT: mulq %r8 3010; SSSE3-NEXT: movq %rax, %rdi 3011; SSSE3-NEXT: movq %rdx, %rsi 3012; SSSE3-NEXT: addq %rbx, %rsi 3013; SSSE3-NEXT: setb %r13b 3014; SSSE3-NEXT: orb %cl, %r13b 3015; SSSE3-NEXT: orb %bpl, %r13b 3016; SSSE3-NEXT: testq %r9, %r9 3017; SSSE3-NEXT: setne %al 3018; SSSE3-NEXT: testq %r12, %r12 3019; SSSE3-NEXT: setne %r10b 3020; SSSE3-NEXT: andb %al, %r10b 3021; SSSE3-NEXT: movq %r12, %rax 3022; SSSE3-NEXT: mulq %r14 3023; SSSE3-NEXT: movq %rax, %rbp 3024; SSSE3-NEXT: seto %r8b 3025; SSSE3-NEXT: movq %r9, %rax 3026; SSSE3-NEXT: mulq %r11 3027; SSSE3-NEXT: seto %cl 3028; SSSE3-NEXT: orb %r8b, %cl 3029; SSSE3-NEXT: addq %rax, %rbp 3030; SSSE3-NEXT: movq %r11, %rax 3031; SSSE3-NEXT: mulq %r14 3032; SSSE3-NEXT: addq %rbp, %rdx 3033; SSSE3-NEXT: setb %bl 3034; SSSE3-NEXT: orb %cl, %bl 3035; SSSE3-NEXT: orb %r10b, %bl 3036; SSSE3-NEXT: movzbl %bl, %ecx 3037; SSSE3-NEXT: negl %ecx 3038; SSSE3-NEXT: movd %ecx, %xmm1 3039; SSSE3-NEXT: movzbl %r13b, %ecx 3040; SSSE3-NEXT: negl %ecx 3041; SSSE3-NEXT: movd %ecx, %xmm0 3042; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 3043; SSSE3-NEXT: movq %rax, 16(%r15) 3044; SSSE3-NEXT: movq %rdi, (%r15) 3045; SSSE3-NEXT: movq %rdx, 24(%r15) 3046; SSSE3-NEXT: movq %rsi, 8(%r15) 3047; SSSE3-NEXT: popq %rbx 3048; SSSE3-NEXT: popq %r12 3049; SSSE3-NEXT: popq %r13 3050; SSSE3-NEXT: popq %r14 3051; SSSE3-NEXT: popq %r15 3052; SSSE3-NEXT: popq %rbp 3053; SSSE3-NEXT: retq 3054; 3055; SSE41-LABEL: umulo_v2i128: 3056; SSE41: # %bb.0: 3057; SSE41-NEXT: pushq %rbp 3058; SSE41-NEXT: pushq %r15 3059; SSE41-NEXT: pushq %r14 3060; SSE41-NEXT: pushq %r13 3061; SSE41-NEXT: pushq %r12 3062; SSE41-NEXT: pushq %rbx 3063; SSE41-NEXT: movq %r9, %r10 3064; SSE41-NEXT: movq %rcx, %r12 3065; SSE41-NEXT: movq %rdx, %r11 3066; SSE41-NEXT: movq %rsi, %rax 3067; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r15 3068; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r14 3069; SSE41-NEXT: movq {{[0-9]+}}(%rsp), %r9 3070; SSE41-NEXT: testq %r10, %r10 3071; SSE41-NEXT: setne %dl 3072; SSE41-NEXT: testq %rsi, %rsi 3073; SSE41-NEXT: setne %bpl 3074; SSE41-NEXT: andb %dl, %bpl 3075; SSE41-NEXT: mulq %r8 3076; SSE41-NEXT: movq %rax, %rsi 3077; SSE41-NEXT: seto %bl 3078; SSE41-NEXT: movq %r10, %rax 3079; SSE41-NEXT: mulq %rdi 3080; SSE41-NEXT: seto %cl 3081; SSE41-NEXT: orb %bl, %cl 3082; SSE41-NEXT: leaq (%rsi,%rax), %rbx 3083; SSE41-NEXT: movq %rdi, %rax 3084; SSE41-NEXT: mulq %r8 3085; SSE41-NEXT: movq %rax, %rdi 3086; SSE41-NEXT: movq %rdx, %rsi 3087; SSE41-NEXT: addq %rbx, %rsi 3088; SSE41-NEXT: setb %r13b 3089; SSE41-NEXT: orb %cl, %r13b 3090; SSE41-NEXT: orb %bpl, %r13b 3091; SSE41-NEXT: testq %r9, %r9 3092; SSE41-NEXT: setne %al 3093; SSE41-NEXT: testq %r12, %r12 3094; SSE41-NEXT: setne %r10b 3095; SSE41-NEXT: andb %al, %r10b 3096; SSE41-NEXT: movq %r12, %rax 3097; SSE41-NEXT: mulq %r14 3098; SSE41-NEXT: movq %rax, %rbp 3099; SSE41-NEXT: seto %r8b 3100; SSE41-NEXT: movq %r9, %rax 3101; SSE41-NEXT: mulq %r11 3102; SSE41-NEXT: seto %cl 3103; SSE41-NEXT: orb %r8b, %cl 3104; SSE41-NEXT: addq %rax, %rbp 3105; SSE41-NEXT: movq %r11, %rax 3106; SSE41-NEXT: mulq %r14 3107; SSE41-NEXT: addq %rbp, %rdx 3108; SSE41-NEXT: setb %bl 3109; SSE41-NEXT: orb %cl, %bl 3110; SSE41-NEXT: orb %r10b, %bl 3111; SSE41-NEXT: movzbl %bl, %ecx 3112; SSE41-NEXT: negl %ecx 3113; SSE41-NEXT: movzbl %r13b, %ebp 3114; SSE41-NEXT: negl %ebp 3115; SSE41-NEXT: movd %ebp, %xmm0 3116; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 3117; SSE41-NEXT: movq %rax, 16(%r15) 3118; SSE41-NEXT: movq %rdi, (%r15) 3119; SSE41-NEXT: movq %rdx, 24(%r15) 3120; SSE41-NEXT: movq %rsi, 8(%r15) 3121; SSE41-NEXT: popq %rbx 3122; SSE41-NEXT: popq %r12 3123; SSE41-NEXT: popq %r13 3124; SSE41-NEXT: popq %r14 3125; SSE41-NEXT: popq %r15 3126; SSE41-NEXT: popq %rbp 3127; SSE41-NEXT: retq 3128; 3129; AVX-LABEL: umulo_v2i128: 3130; AVX: # %bb.0: 3131; AVX-NEXT: pushq %rbp 3132; AVX-NEXT: pushq %r15 3133; AVX-NEXT: pushq %r14 3134; AVX-NEXT: pushq %r13 3135; AVX-NEXT: pushq %r12 3136; AVX-NEXT: pushq %rbx 3137; AVX-NEXT: movq %r9, %r10 3138; AVX-NEXT: movq %rcx, %r12 3139; AVX-NEXT: movq %rdx, %r11 3140; AVX-NEXT: movq %rsi, %rax 3141; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r15 3142; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r14 3143; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r9 3144; AVX-NEXT: testq %r10, %r10 3145; AVX-NEXT: setne %dl 3146; AVX-NEXT: testq %rsi, %rsi 3147; AVX-NEXT: setne %bpl 3148; AVX-NEXT: andb %dl, %bpl 3149; AVX-NEXT: mulq %r8 3150; AVX-NEXT: movq %rax, %rsi 3151; AVX-NEXT: seto %bl 3152; AVX-NEXT: movq %r10, %rax 3153; AVX-NEXT: mulq %rdi 3154; AVX-NEXT: seto %cl 3155; AVX-NEXT: orb %bl, %cl 3156; AVX-NEXT: leaq (%rsi,%rax), %rbx 3157; AVX-NEXT: movq %rdi, %rax 3158; AVX-NEXT: mulq %r8 3159; AVX-NEXT: movq %rax, %rdi 3160; AVX-NEXT: movq %rdx, %rsi 3161; AVX-NEXT: addq %rbx, %rsi 3162; AVX-NEXT: setb %r13b 3163; AVX-NEXT: orb %cl, %r13b 3164; AVX-NEXT: orb %bpl, %r13b 3165; AVX-NEXT: testq %r9, %r9 3166; AVX-NEXT: setne %al 3167; AVX-NEXT: testq %r12, %r12 3168; AVX-NEXT: setne %r10b 3169; AVX-NEXT: andb %al, %r10b 3170; AVX-NEXT: movq %r12, %rax 3171; AVX-NEXT: mulq %r14 3172; AVX-NEXT: movq %rax, %rbp 3173; AVX-NEXT: seto %r8b 3174; AVX-NEXT: movq %r9, %rax 3175; AVX-NEXT: mulq %r11 3176; AVX-NEXT: seto %cl 3177; AVX-NEXT: orb %r8b, %cl 3178; AVX-NEXT: addq %rax, %rbp 3179; AVX-NEXT: movq %r11, %rax 3180; AVX-NEXT: mulq %r14 3181; AVX-NEXT: addq %rbp, %rdx 3182; AVX-NEXT: setb %bl 3183; AVX-NEXT: orb %cl, %bl 3184; AVX-NEXT: orb %r10b, %bl 3185; AVX-NEXT: movzbl %bl, %ecx 3186; AVX-NEXT: negl %ecx 3187; AVX-NEXT: movzbl %r13b, %ebp 3188; AVX-NEXT: negl %ebp 3189; AVX-NEXT: vmovd %ebp, %xmm0 3190; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 3191; AVX-NEXT: movq %rax, 16(%r15) 3192; AVX-NEXT: movq %rdi, (%r15) 3193; AVX-NEXT: movq %rdx, 24(%r15) 3194; AVX-NEXT: movq %rsi, 8(%r15) 3195; AVX-NEXT: popq %rbx 3196; AVX-NEXT: popq %r12 3197; AVX-NEXT: popq %r13 3198; AVX-NEXT: popq %r14 3199; AVX-NEXT: popq %r15 3200; AVX-NEXT: popq %rbp 3201; AVX-NEXT: retq 3202; 3203; AVX512F-LABEL: umulo_v2i128: 3204; AVX512F: # %bb.0: 3205; AVX512F-NEXT: pushq %rbp 3206; AVX512F-NEXT: pushq %r15 3207; AVX512F-NEXT: pushq %r14 3208; AVX512F-NEXT: pushq %r12 3209; AVX512F-NEXT: pushq %rbx 3210; AVX512F-NEXT: movq %rcx, %rax 3211; AVX512F-NEXT: movq %rdx, %r12 3212; AVX512F-NEXT: movq %rdi, %r11 3213; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14 3214; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r15 3215; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10 3216; AVX512F-NEXT: testq %r10, %r10 3217; AVX512F-NEXT: setne %dl 3218; AVX512F-NEXT: testq %rcx, %rcx 3219; AVX512F-NEXT: setne %bl 3220; AVX512F-NEXT: andb %dl, %bl 3221; AVX512F-NEXT: mulq %r15 3222; AVX512F-NEXT: movq %rax, %rdi 3223; AVX512F-NEXT: seto %bpl 3224; AVX512F-NEXT: movq %r10, %rax 3225; AVX512F-NEXT: mulq %r12 3226; AVX512F-NEXT: seto %cl 3227; AVX512F-NEXT: orb %bpl, %cl 3228; AVX512F-NEXT: leaq (%rdi,%rax), %rbp 3229; AVX512F-NEXT: movq %r12, %rax 3230; AVX512F-NEXT: mulq %r15 3231; AVX512F-NEXT: movq %rax, %r10 3232; AVX512F-NEXT: movq %rdx, %rdi 3233; AVX512F-NEXT: addq %rbp, %rdi 3234; AVX512F-NEXT: setb %al 3235; AVX512F-NEXT: orb %cl, %al 3236; AVX512F-NEXT: orb %bl, %al 3237; AVX512F-NEXT: kmovw %eax, %k0 3238; AVX512F-NEXT: testq %r9, %r9 3239; AVX512F-NEXT: setne %al 3240; AVX512F-NEXT: testq %rsi, %rsi 3241; AVX512F-NEXT: setne %cl 3242; AVX512F-NEXT: andb %al, %cl 3243; AVX512F-NEXT: movq %rsi, %rax 3244; AVX512F-NEXT: mulq %r8 3245; AVX512F-NEXT: movq %rax, %rsi 3246; AVX512F-NEXT: seto %bpl 3247; AVX512F-NEXT: movq %r9, %rax 3248; AVX512F-NEXT: mulq %r11 3249; AVX512F-NEXT: seto %bl 3250; AVX512F-NEXT: orb %bpl, %bl 3251; AVX512F-NEXT: addq %rax, %rsi 3252; AVX512F-NEXT: movq %r11, %rax 3253; AVX512F-NEXT: mulq %r8 3254; AVX512F-NEXT: addq %rsi, %rdx 3255; AVX512F-NEXT: setb %sil 3256; AVX512F-NEXT: orb %bl, %sil 3257; AVX512F-NEXT: orb %cl, %sil 3258; AVX512F-NEXT: andl $1, %esi 3259; AVX512F-NEXT: kmovw %esi, %k1 3260; AVX512F-NEXT: kshiftlw $1, %k0, %k0 3261; AVX512F-NEXT: korw %k0, %k1, %k1 3262; AVX512F-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3263; AVX512F-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 3264; AVX512F-NEXT: movq %r10, 16(%r14) 3265; AVX512F-NEXT: movq %rax, (%r14) 3266; AVX512F-NEXT: movq %rdi, 24(%r14) 3267; AVX512F-NEXT: movq %rdx, 8(%r14) 3268; AVX512F-NEXT: popq %rbx 3269; AVX512F-NEXT: popq %r12 3270; AVX512F-NEXT: popq %r14 3271; AVX512F-NEXT: popq %r15 3272; AVX512F-NEXT: popq %rbp 3273; AVX512F-NEXT: retq 3274; 3275; AVX512BW-LABEL: umulo_v2i128: 3276; AVX512BW: # %bb.0: 3277; AVX512BW-NEXT: pushq %rbp 3278; AVX512BW-NEXT: pushq %r15 3279; AVX512BW-NEXT: pushq %r14 3280; AVX512BW-NEXT: pushq %r12 3281; AVX512BW-NEXT: pushq %rbx 3282; AVX512BW-NEXT: movq %rcx, %rax 3283; AVX512BW-NEXT: movq %rdx, %r12 3284; AVX512BW-NEXT: movq %rdi, %r11 3285; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r14 3286; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r15 3287; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 3288; AVX512BW-NEXT: testq %r10, %r10 3289; AVX512BW-NEXT: setne %dl 3290; AVX512BW-NEXT: testq %rcx, %rcx 3291; AVX512BW-NEXT: setne %bl 3292; AVX512BW-NEXT: andb %dl, %bl 3293; AVX512BW-NEXT: mulq %r15 3294; AVX512BW-NEXT: movq %rax, %rdi 3295; AVX512BW-NEXT: seto %bpl 3296; AVX512BW-NEXT: movq %r10, %rax 3297; AVX512BW-NEXT: mulq %r12 3298; AVX512BW-NEXT: seto %cl 3299; AVX512BW-NEXT: orb %bpl, %cl 3300; AVX512BW-NEXT: leaq (%rdi,%rax), %rbp 3301; AVX512BW-NEXT: movq %r12, %rax 3302; AVX512BW-NEXT: mulq %r15 3303; AVX512BW-NEXT: movq %rax, %r10 3304; AVX512BW-NEXT: movq %rdx, %rdi 3305; AVX512BW-NEXT: addq %rbp, %rdi 3306; AVX512BW-NEXT: setb %al 3307; AVX512BW-NEXT: orb %cl, %al 3308; AVX512BW-NEXT: orb %bl, %al 3309; AVX512BW-NEXT: kmovd %eax, %k0 3310; AVX512BW-NEXT: testq %r9, %r9 3311; AVX512BW-NEXT: setne %al 3312; AVX512BW-NEXT: testq %rsi, %rsi 3313; AVX512BW-NEXT: setne %cl 3314; AVX512BW-NEXT: andb %al, %cl 3315; AVX512BW-NEXT: movq %rsi, %rax 3316; AVX512BW-NEXT: mulq %r8 3317; AVX512BW-NEXT: movq %rax, %rsi 3318; AVX512BW-NEXT: seto %bpl 3319; AVX512BW-NEXT: movq %r9, %rax 3320; AVX512BW-NEXT: mulq %r11 3321; AVX512BW-NEXT: seto %bl 3322; AVX512BW-NEXT: orb %bpl, %bl 3323; AVX512BW-NEXT: addq %rax, %rsi 3324; AVX512BW-NEXT: movq %r11, %rax 3325; AVX512BW-NEXT: mulq %r8 3326; AVX512BW-NEXT: addq %rsi, %rdx 3327; AVX512BW-NEXT: setb %sil 3328; AVX512BW-NEXT: orb %bl, %sil 3329; AVX512BW-NEXT: orb %cl, %sil 3330; AVX512BW-NEXT: andl $1, %esi 3331; AVX512BW-NEXT: kmovw %esi, %k1 3332; AVX512BW-NEXT: kshiftlw $1, %k0, %k0 3333; AVX512BW-NEXT: korw %k0, %k1, %k1 3334; AVX512BW-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 3335; AVX512BW-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} 3336; AVX512BW-NEXT: movq %r10, 16(%r14) 3337; AVX512BW-NEXT: movq %rax, (%r14) 3338; AVX512BW-NEXT: movq %rdi, 24(%r14) 3339; AVX512BW-NEXT: movq %rdx, 8(%r14) 3340; AVX512BW-NEXT: popq %rbx 3341; AVX512BW-NEXT: popq %r12 3342; AVX512BW-NEXT: popq %r14 3343; AVX512BW-NEXT: popq %r15 3344; AVX512BW-NEXT: popq %rbp 3345; AVX512BW-NEXT: retq 3346 %t = call {<2 x i128>, <2 x i1>} @llvm.umul.with.overflow.v2i128(<2 x i128> %a0, <2 x i128> %a1) 3347 %val = extractvalue {<2 x i128>, <2 x i1>} %t, 0 3348 %obit = extractvalue {<2 x i128>, <2 x i1>} %t, 1 3349 %res = sext <2 x i1> %obit to <2 x i32> 3350 store <2 x i128> %val, ptr %p2 3351 ret <2 x i32> %res 3352} 3353