; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE2-LABEL: mul_v16i8c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: packuswb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8c:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <16 x i8> %A
}

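; Note: x86 has no vector multiply on i8 elements, so the v16i8 cases above
; are widened: bytes are zero-extended/unpacked to i16, multiplied with
; pmullw, masked back to the low byte with a 255 splat, and re-packed with
; packuswb (AVX512BW instead truncates with vpmovwb). The i16 tests below
; need none of this, since pmullw multiplies i16 lanes directly.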
define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind {
; SSE-LABEL: mul_v8i16c:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <8 x i16> %A
}

define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
; SSE2-LABEL: mul_v4i32c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117]
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
  ret <4 x i32> %A
}

define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
; SSE-LABEL: mul_v2i64c:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <2 x i64> %i, < i64 117, i64 117 >
  ret <2 x i64> %A
}

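; Note: with no legal i64 element multiply here, mul_v2i64c above assembles
; each lane from 32-bit pmuludq products. Writing x = 2^32*hi(x) + lo(x),
;   x * 117 (mod 2^64) = lo(x)*117 + ((hi(x)*117) << 32)
; and the cross term involving hi(117) vanishes because the constant fits in
; 32 bits, so one plain and one shifted pmuludq suffice.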
define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE2-LABEL: mul_v16i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm1, %xmm0
; SSE41-NEXT: pmullw %xmm3, %xmm2
; SSE41-NEXT: pand %xmm1, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
entry:
  %A = mul <16 x i8> %i, %j
  ret <16 x i8> %A
}

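; Note: the variable v16i8 multiply above uses the same widen/mask/pack
; scheme as the constant version, except both operands now have to be
; unpacked to i16 first, doubling the unpack work in the 128-bit lowerings.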
define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind {
; SSE-LABEL: mul_v8i16:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i16> %i, %j
  ret <8 x i16> %A
}

define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

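; Note: for a variable v2i64 multiply both 32-bit cross terms are live.
; With x = 2^32*xh + xl and y = 2^32*yh + yl:
;   x * y (mod 2^64) = xl*yl + ((xl*yh + xh*yl) << 32)
; (the xh*yh term is dead modulo 2^64), which the lowering above implements
; as three pmuludq plus three shifts and two paddq.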
declare void @foo()

define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32spill:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: subq $40, %rsp
; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: callq foo@PLT
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32spill:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: subq $40, %rsp
; SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE41-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE41-NEXT: callq foo@PLT
; SSE41-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE41-NEXT: pmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE41-NEXT: addq $40, %rsp
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32spill:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq foo@PLT
; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vpmulld {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

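; Note: the call above forces both inputs to be spilled. For v4i32 the
; SSE4.1/AVX multiply can then fold one reload straight into pmulld as a
; memory operand; the v2i64 case below reloads into registers instead,
; presumably because its multi-instruction expansion uses each value more
; than once.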
define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64spill:
; SSE: # %bb.0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: callq foo@PLT
; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64spill:
; AVX: # %bb.0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq foo@PLT
; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX-NEXT: vpsrlq $32, %xmm3, %xmm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm2, %xmm1
; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm2, %xmm3, %xmm1
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

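; The remaining tests repeat these patterns at 256-bit and 512-bit widths.
; Under plain SSE a <32 x i8> value arrives split across two xmm registers,
; so each SSE lowering below is just the 128-bit sequence applied per half,
; while AVX2 and later operate on whole ymm/zmm registers.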
define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE2-LABEL: mul_v32i8c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [117,117,117,117,117,117,117,117]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm3, %xmm2
; SSE2-NEXT: pand %xmm4, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v32i8c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm4, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: pmullw %xmm4, %xmm2
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm4, %xmm1
; SSE41-NEXT: pand %xmm5, %xmm1
; SSE41-NEXT: pmullw %xmm4, %xmm3
; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: packuswb %xmm1, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8c:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <32 x i8> %A
}

define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind {
; SSE-LABEL: mul_v16i16c:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v16i16c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <16 x i16> %A
}

define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind {
; SSE2-LABEL: mul_v8i32c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i32c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v8i32c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117]
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 >
  ret <8 x i32> %A
}

define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind {
; SSE-LABEL: mul_v4i64c:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v4i64c:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [117,117,117,117]
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
  ret <4 x i64> %A
}

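; Note: at 256 bits the AVX lowerings above materialize splat constants with
; vpbroadcastd/vpbroadcastq rather than a full constant-pool vmovdqa, and the
; v4i64 constant case again needs only the single shifted pmuludq cross term
; because 117 fits in 32 bits.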
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; SSE2-LABEL: mul_v32i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm2, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v32i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm6, %xmm0
; SSE41-NEXT: pmullw %xmm5, %xmm4
; SSE41-NEXT: pand %xmm6, %xmm4
; SSE41-NEXT: packuswb %xmm0, %xmm4
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm3, %xmm1
; SSE41-NEXT: pand %xmm6, %xmm1
; SSE41-NEXT: pmullw %xmm0, %xmm2
; SSE41-NEXT: pand %xmm6, %xmm2
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <32 x i8> %i, %j
  ret <32 x i8> %A
}

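; Note: with AVX512BW the v32i8 multiply above zero-extends both operands
; into one zmm register, performs a single vpmullw over 32 i16 lanes, and
; truncates back with vpmovwb, avoiding the unpack/mask/pack sequence
; entirely.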
define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind {
; SSE-LABEL: mul_v16i16:
; SSE: # %bb.0: # %entry
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v16i16:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <16 x i16> %i, %j
  ret <16 x i16> %A
}

define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind {
; SSE2-LABEL: mul_v8i32:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i32:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v8i32:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i32> %i, %j
  ret <8 x i32> %A
}

define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-LABEL: mul_v4i64:
; SSE: # %bb.0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: paddq %xmm2, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v4i64:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i64> %i, %j
  ret <4 x i64> %A
}

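; 512-bit cases follow. AVX512F alone has no i8/i16 multiply on zmm, so its
; v64i8 lowerings extract the two ymm halves with vextracti64x4, process each
; as at 256 bits, and reassemble with vinserti64x4; AVX512BW handles the full
; vector with one zmm vpmullw (plus vpandq/vpackuswb or vpmovwb).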
define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE2-LABEL: mul_v64i8c:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm4, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: packuswb %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm4, %xmm1
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: packuswb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm4, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: packuswb %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm4, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: packuswb %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v64i8c:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm6, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm7, %xmm1
; SSE41-NEXT: pmullw %xmm6, %xmm0
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm6, %xmm4
; SSE41-NEXT: pand %xmm7, %xmm4
; SSE41-NEXT: pmullw %xmm6, %xmm1
; SSE41-NEXT: pand %xmm7, %xmm1
; SSE41-NEXT: packuswb %xmm4, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm6, %xmm2
; SSE41-NEXT: pand %xmm7, %xmm2
; SSE41-NEXT: pmullw %xmm6, %xmm4
; SSE41-NEXT: pand %xmm7, %xmm4
; SSE41-NEXT: packuswb %xmm2, %xmm4
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm6, %xmm3
; SSE41-NEXT: pand %xmm7, %xmm3
; SSE41-NEXT: pmullw %xmm6, %xmm5
; SSE41-NEXT: pand %xmm7, %xmm5
; SSE41-NEXT: packuswb %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm5, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8c:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v64i8c:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm4, %ymm1, %ymm1
; AVX512F-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm4, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8c:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <64 x i8> %A
}

define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; SSE2-LABEL: mul_v64i8:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movdqa %xmm4, %xmm8
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm0, %xmm9
; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm8, %xmm9
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm8, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm4, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: packuswb %xmm9, %xmm0
; SSE2-NEXT: movdqa %xmm5, %xmm9
; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm9, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm5, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm6, %xmm2
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: packuswb %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pmullw %xmm7, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: packuswb %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v64i8:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movdqa %xmm1, %xmm8
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm4, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm9, %xmm1
; SSE41-NEXT: pmullw %xmm10, %xmm0
; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm5, %xmm8
; SSE41-NEXT: pand %xmm9, %xmm8
; SSE41-NEXT: pmullw %xmm4, %xmm1
; SSE41-NEXT: pand %xmm9, %xmm1
; SSE41-NEXT: packuswb %xmm8, %xmm1
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm6, %xmm2
; SSE41-NEXT: pand %xmm9, %xmm2
; SSE41-NEXT: pmullw %xmm5, %xmm4
; SSE41-NEXT: pand %xmm9, %xmm4
; SSE41-NEXT: packuswb %xmm2, %xmm4
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT: pmullw %xmm7, %xmm3
; SSE41-NEXT: pand %xmm9, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm5
; SSE41-NEXT: pand %xmm9, %xmm5
; SSE41-NEXT: packuswb %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm5, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX2-NEXT: vpackuswb %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vpmullw %ymm2, %ymm4, %ymm2
; AVX2-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1
; AVX2-NEXT: vpackuswb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v64i8:
; AVX512F: # %bb.0: # %entry
; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpand %ymm5, %ymm2, %ymm2
; AVX512F-NEXT: vpackuswb %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT: vpmullw %ymm3, %ymm4, %ymm3
; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpand %ymm5, %ymm0, %ymm0
; AVX512F-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8:
; AVX512BW: # %bb.0: # %entry
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <64 x i8> %i, %j
  ret <64 x i8> %A
}

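; The PR30845 tests below check that multiplying two values zero-extended
; from i32 is recognized as a plain widening multiply: pmuludq already
; computes the full 64-bit product of the low 32-bit halves, so no shift/add
; fixup is needed and the shufps [1,3] simply extracts the odd (high) dwords.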
; PR30845
define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE2-LABEL: mul_v4i64_zero_upper:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_upper:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pmuludq %xmm2, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
; SSE41-NEXT: pmuludq %xmm3, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_upper:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %val2a = zext <4 x i32> %val2 to <4 x i64>
  %res64 = mul <4 x i64> %val1a, %val2a
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-LABEL: mul_v4i64_zero_upper_left:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: pmuludq %xmm2, %xmm4
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
; SSE2-NEXT: paddq %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: psrlq $32, %xmm1
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: psllq $32, %xmm3
; SSE2-NEXT: paddq %xmm0, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
; SSE2-NEXT: movaps %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_upper_left:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE41-NEXT: movdqa %xmm0, %xmm4
; SSE41-NEXT: pmuludq %xmm2, %xmm4
; SSE41-NEXT: psrlq $32, %xmm2
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: psllq $32, %xmm2
; SSE41-NEXT: paddq %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm3, %xmm0
; SSE41-NEXT: pmuludq %xmm1, %xmm0
; SSE41-NEXT: psrlq $32, %xmm1
; SSE41-NEXT: pmuludq %xmm1, %xmm3
; SSE41-NEXT: psllq $32, %xmm3
; SSE41-NEXT: paddq %xmm0, %xmm3
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_upper_left:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %res64 = mul <4 x i64> %val1a, %val2
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-LABEL: mul_v4i64_zero_lower:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,3,3]
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: psrlq $32, %xmm1
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm2[0,2]
; SSE2-NEXT: movaps %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_lower:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
; SSE41-NEXT: psrlq $32, %xmm2
; SSE41-NEXT: pmuludq %xmm3, %xmm2
; SSE41-NEXT: psrlq $32, %xmm1
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pmuludq %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i64_zero_lower:
; AVX: # %bb.0: # %entry
; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296>
  %res64 = mul <4 x i64> %val1a, %val2a
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

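; Note: in mul_v4i64_zero_lower above, the mask keeps only the high 32 bits
; of %val2, so each product reduces to (val1 * hi(val2)) << 32. The lowering
; shifts the masked operand right by 32 and issues a single pmuludq; the
; shufps [0,2] then picks the low dwords of those products, which are exactly
; the high halves of the shifted results.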
define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_zero_upper:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,1,1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,3,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm5, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm6, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm7, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_zero_upper:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,1,3,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,1,3,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT: pmuludq %xmm4, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
; SSE41-NEXT: pmuludq %xmm5, %xmm1
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: pmuludq %xmm6, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,3,3]
; SSE41-NEXT: pmuludq %xmm7, %xmm2
; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v8i64_zero_upper:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_zero_upper:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: retq
entry:
  %val1a = zext <8 x i32> %val1 to <8 x i64>
  %val2a = zext <8 x i32> %val2 to <8 x i64>
  %res64 = mul <8 x i64> %val1a, %val2a
  %rescast = bitcast <8 x i64> %res64 to <16 x i32>
  %res = shufflevector <16 x i32> %rescast, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <8 x i32> %res
}
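; Sign-extended operands: SSE4.1 and later can select pmuldq/vpmuldq directly
; for the sext(i16) * sext(i32) product, whereas plain SSE2 has no pmuldq and
; must materialize the sign words with pcmpgtd before expanding the full
; 64-bit multiply.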
define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm1, %xmm15
; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
; SSE2-NEXT: psrad $16, %xmm14
; SSE2-NEXT: pxor %xmm13, %xmm13
; SSE2-NEXT: pxor %xmm10, %xmm10
; SSE2-NEXT: pcmpgtd %xmm14, %xmm10
; SSE2-NEXT: movdqa %xmm14, %xmm8
; SSE2-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm10[2],xmm8[3],xmm10[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm11
; SSE2-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: pcmpgtd %xmm3, %xmm9
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: pcmpgtd %xmm2, %xmm12
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT: pxor %xmm7, %xmm7
; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
; SSE2-NEXT: pcmpgtd %xmm15, %xmm13
; SSE2-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm15, %xmm6
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm4
; SSE2-NEXT: paddq %xmm6, %xmm4
; SSE2-NEXT: psllq $32, %xmm4
; SSE2-NEXT: pmuludq %xmm15, %xmm0
; SSE2-NEXT: paddq %xmm4, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm11, %xmm5
; SSE2-NEXT: paddq %xmm4, %xmm5
; SSE2-NEXT: psllq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm11, %xmm1
; SSE2-NEXT: paddq %xmm5, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm14, %xmm5
; SSE2-NEXT: paddq %xmm4, %xmm5
; SSE2-NEXT: psllq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm14, %xmm2
; SSE2-NEXT: paddq %xmm5, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,1,3]
; SSE2-NEXT: pmuludq %xmm8, %xmm5
; SSE2-NEXT: paddq %xmm4, %xmm5
; SSE2-NEXT: psllq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm8, %xmm3
; SSE2-NEXT: paddq %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_sext:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
; SSE41-NEXT: pmovsxwq %xmm0, %xmm7
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,1,3,3]
; SSE41-NEXT: pmuldq %xmm4, %xmm3
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT: pmuldq %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,1,3,3]
; SSE41-NEXT: pmuldq %xmm6, %xmm4
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pmuldq %xmm7, %xmm0
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v8i64_sext:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_sext:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %1 = sext <8 x i16> %val1 to <8 x i64>
  %2 = sext <8 x i32> %val2 to <8 x i64>
  %3 = mul <8 x i64> %1, %2
  ret <8 x i64> %3
}
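; The shl/ashr pair sign-extends the low 32 bits of each element in place, so
; squaring the result should fold to a single pmuldq/vpmuldq on SSE4.1 and
; later; SSE2 lacks pmuldq and keeps the expanded sequence.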
define <2 x i64> @pmuldq_square(<2 x i64> %x) {
; SSE2-LABEL: pmuldq_square:
; SSE2: # %bb.0:
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
; SSE2-NEXT: psllq $32, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: psrlq $32, %xmm0
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pmuludq %xmm1, %xmm1
; SSE2-NEXT: paddq %xmm0, %xmm0
; SSE2-NEXT: psllq $32, %xmm0
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: pmuldq_square:
; SSE41: # %bb.0:
; SSE41-NEXT: pmuldq %xmm0, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: pmuldq_square:
; AVX: # %bb.0:
; AVX-NEXT: vpmuldq %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = shl <2 x i64> %x, <i64 32, i64 32>
  %2 = ashr exact <2 x i64> %1, <i64 32, i64 32>
  %3 = mul nsw <2 x i64> %2, %2
  ret <2 x i64> %3
}

define <2 x i64> @pmuludq_square(<2 x i64> %x) {
; SSE-LABEL: pmuludq_square:
; SSE: # %bb.0:
; SSE-NEXT: pmuludq %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: pmuludq_square:
; AVX: # %bb.0:
; AVX-NEXT: vpmuludq %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = and <2 x i64> %x, <i64 4294967295, i64 4294967295>
  %2 = mul nuw <2 x i64> %1, %1
  ret <2 x i64> %2
}