; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind {
; SSE2-LABEL: mul_v16i8c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm2
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm2, %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8c:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8c:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8c:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: retq
entry:
  %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <16 x i8> %A
}

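; v8i16 multiplies map directly to pmullw on every subtarget, so no widening is needed.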
define <8 x i16> @mul_v8i16c(<8 x i16> %i) nounwind {
; SSE-LABEL: mul_v8i16c:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <8 x i16> %A
}

define <4 x i32> @mul_v4i32c(<4 x i32> %i) nounwind {
; SSE2-LABEL: mul_v4i32c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld {{.*}}(%rip), %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 >
  ret <4 x i32> %A
}

define <2 x i64> @mul_v2i64c(<2 x i64> %i) nounwind {
; SSE-LABEL: mul_v2i64c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117]
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <2 x i64> %i, < i64 117, i64 117 >
  ret <2 x i64> %A
}

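; x86 has no vector i8 multiply, so v16i8 is sign-extended to i16, multiplied with pmullw, and truncated back to bytes (pand+packuswb, vpmovdb, or vpmovwb depending on the subtarget).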
define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind {
; SSE2-LABEL: mul_v16i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm1, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm2, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm1, %xmm0
; SSE2-NEXT: pand %xmm2, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v16i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; SSE41-NEXT: pmullw %xmm3, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm3, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm1, %xmm0
; SSE41-NEXT: pand %xmm3, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v16i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v16i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v16i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; AVX512BW-NEXT: retq
entry:
  %A = mul <16 x i8> %i, %j
  ret <16 x i8> %A
}

define <8 x i16> @mul_v8i16(<8 x i16> %i, <8 x i16> %j) nounwind {
; SSE-LABEL: mul_v8i16:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v8i16:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i16> %i, %j
  ret <8 x i16> %A
}

define <4 x i32> @mul_v4i32(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

define <2 x i64> @mul_v2i64(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm1, %xmm2
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: psrlq $32, %xmm3
; SSE-NEXT: pmuludq %xmm0, %xmm3
; SSE-NEXT: paddq %xmm2, %xmm3
; SSE-NEXT: psllq $32, %xmm3
; SSE-NEXT: pmuludq %xmm1, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3
; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

declare void @foo()

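; The spill tests use a call to clobber the xmm registers, forcing 16-byte stack spills; SSE4.1 and AVX should fold the reload of the spilled operand straight into pmulld.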
define <4 x i32> @mul_v4i32spill(<4 x i32> %i, <4 x i32> %j) nounwind {
; SSE2-LABEL: mul_v4i32spill:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: subq $40, %rsp
; SSE2-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT: callq foo
; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: addq $40, %rsp
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i32spill:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: subq $40, %rsp
; SSE41-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE41-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE41-NEXT: callq foo
; SSE41-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE41-NEXT: pmulld {{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
; SSE41-NEXT: addq $40, %rsp
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v4i32spill:
; AVX: # BB#0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq foo
; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vpmulld {{[0-9]+}}(%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <4 x i32> %i, %j
  ret <4 x i32> %A
}

define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
; SSE-LABEL: mul_v2i64spill:
; SSE: # BB#0: # %entry
; SSE-NEXT: subq $40, %rsp
; SSE-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT: callq foo
; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm0, %xmm1
; SSE-NEXT: paddq %xmm2, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: pmuludq %xmm3, %xmm0
; SSE-NEXT: paddq %xmm1, %xmm0
; SSE-NEXT: addq $40, %rsp
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v2i64spill:
; AVX: # BB#0: # %entry
; AVX-NEXT: subq $40, %rsp
; AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: callq foo
; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX-NEXT: vpsrlq $32, %xmm3, %xmm0
; AVX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; AVX-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpsrlq $32, %xmm2, %xmm1
; AVX-NEXT: vpmuludq %xmm1, %xmm3, %xmm1
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpsllq $32, %xmm0, %xmm0
; AVX-NEXT: vpmuludq %xmm2, %xmm3, %xmm1
; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX-NEXT: addq $40, %rsp
; AVX-NEXT: retq
entry:
  ; Use a call to force spills.
  call void @foo()
  %A = mul <2 x i64> %i, %j
  ret <2 x i64> %A
}

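; v32i8: the byte multiply is widened per 128-bit half; AVX512BW can instead widen the whole vector to 32 x i16 in a zmm register and truncate with vpmovwb.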
define <32 x i8> @mul_v32i8c(<32 x i8> %i) nounwind {
; SSE2-LABEL: mul_v32i8c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm3, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw %xmm2, %xmm3
; SSE2-NEXT: pand %xmm4, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm2, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm3, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v32i8c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm4, %xmm2
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm4, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm2
; SSE41-NEXT: pmovsxbw %xmm1, %xmm3
; SSE41-NEXT: pmullw %xmm4, %xmm3
; SSE41-NEXT: pand %xmm5, %xmm3
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm4, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm3
; SSE41-NEXT: movdqa %xmm2, %xmm0
; SSE41-NEXT: movdqa %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8c:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8c:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8c:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <32 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <32 x i8> %A
}

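; v16i16 splits into two xmm pmullw on SSE and is a single ymm vpmullw on AVX.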
define <16 x i16> @mul_v16i16c(<16 x i16> %i) nounwind {
; SSE-LABEL: mul_v16i16c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v16i16c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <16 x i16> %i, < i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117, i16 117 >
  ret <16 x i16> %A
}

define <8 x i32> @mul_v8i32c(<8 x i32> %i) nounwind {
; SSE2-LABEL: mul_v8i32c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i32c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117]
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v8i32c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 >
  ret <8 x i32> %A
}

define <4 x i64> @mul_v4i64c(<4 x i64> %i) nounwind {
; SSE-LABEL: mul_v4i64c:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa {{.*#+}} xmm2 = [117,117]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm0
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: psllq $32, %xmm0
; SSE-NEXT: paddq %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: pmuludq %xmm2, %xmm3
; SSE-NEXT: psrlq $32, %xmm1
; SSE-NEXT: pmuludq %xmm2, %xmm1
; SSE-NEXT: psllq $32, %xmm1
; SSE-NEXT: paddq %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v4i64c:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i64> %i, < i64 117, i64 117, i64 117, i64 117 >
  ret <4 x i64> %A
}

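; Variable v32i8 multiply: the same sign-extend/pmullw/truncate pattern as the constant case.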
define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind {
; SSE2-LABEL: mul_v32i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm2, %xmm0
; SSE2-NEXT: pand %xmm4, %xmm0
; SSE2-NEXT: packuswb %xmm5, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmullw %xmm2, %xmm5
; SSE2-NEXT: pand %xmm4, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm3, %xmm1
; SSE2-NEXT: pand %xmm4, %xmm1
; SSE2-NEXT: packuswb %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v32i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmovsxbw %xmm2, %xmm5
; SSE41-NEXT: pmovsxbw %xmm0, %xmm4
; SSE41-NEXT: pmullw %xmm5, %xmm4
; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm5, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pmullw %xmm2, %xmm0
; SSE41-NEXT: pand %xmm5, %xmm0
; SSE41-NEXT: packuswb %xmm0, %xmm4
; SSE41-NEXT: pmovsxbw %xmm3, %xmm0
; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
; SSE41-NEXT: pmullw %xmm0, %xmm2
; SSE41-NEXT: pand %xmm5, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
; SSE41-NEXT: pmullw %xmm0, %xmm1
; SSE41-NEXT: pand %xmm5, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v32i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v32i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm3
; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v32i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <32 x i8> %i, %j
  ret <32 x i8> %A
}

define <16 x i16> @mul_v16i16(<16 x i16> %i, <16 x i16> %j) nounwind {
; SSE-LABEL: mul_v16i16:
; SSE: # BB#0: # %entry
; SSE-NEXT: pmullw %xmm2, %xmm0
; SSE-NEXT: pmullw %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v16i16:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmullw %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <16 x i16> %i, %j
  ret <16 x i16> %A
}

define <8 x i32> @mul_v8i32(<8 x i32> %i, <8 x i32> %j) nounwind {
; SSE2-LABEL: mul_v8i32:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm3, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i32:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pmulld %xmm2, %xmm0
; SSE41-NEXT: pmulld %xmm3, %xmm1
; SSE41-NEXT: retq
;
; AVX-LABEL: mul_v8i32:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <8 x i32> %i, %j
  ret <8 x i32> %A
}

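; There is no vector i64 multiply below AVX512DQ; each lane is assembled from pmuludq partial products: lo*lo + ((lo*hi + hi*lo) << 32).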
define <4 x i64> @mul_v4i64(<4 x i64> %i, <4 x i64> %j) nounwind {
; SSE-LABEL: mul_v4i64:
; SSE: # BB#0: # %entry
; SSE-NEXT: movdqa %xmm0, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm2, %xmm4
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: psrlq $32, %xmm5
; SSE-NEXT: pmuludq %xmm0, %xmm5
; SSE-NEXT: paddq %xmm4, %xmm5
; SSE-NEXT: psllq $32, %xmm5
; SSE-NEXT: pmuludq %xmm2, %xmm0
; SSE-NEXT: paddq %xmm5, %xmm0
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: psrlq $32, %xmm2
; SSE-NEXT: pmuludq %xmm3, %xmm2
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: psrlq $32, %xmm4
; SSE-NEXT: pmuludq %xmm1, %xmm4
; SSE-NEXT: paddq %xmm2, %xmm4
; SSE-NEXT: psllq $32, %xmm4
; SSE-NEXT: pmuludq %xmm3, %xmm1
; SSE-NEXT: paddq %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: mul_v4i64:
; AVX: # BB#0: # %entry
; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2
; AVX-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
; AVX-NEXT: vpsrlq $32, %ymm1, %ymm3
; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
; AVX-NEXT: vpaddq %ymm2, %ymm3, %ymm2
; AVX-NEXT: vpsllq $32, %ymm2, %ymm2
; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX-NEXT: retq
entry:
  %A = mul <4 x i64> %i, %j
  ret <4 x i64> %A
}

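; v64i8 by constant splat: AVX512BW handles 512 bits as two sign-extend/vpmullw/vpmovwb halves; narrower subtargets split further.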
define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind {
; SSE2-LABEL: mul_v64i8c:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm4, %xmm0
; SSE2-NEXT: pand %xmm5, %xmm0
; SSE2-NEXT: packuswb %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm1, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm4, %xmm1
; SSE2-NEXT: pand %xmm5, %xmm1
; SSE2-NEXT: packuswb %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm2, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw %xmm4, %xmm2
; SSE2-NEXT: pand %xmm5, %xmm2
; SSE2-NEXT: packuswb %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: pmullw %xmm4, %xmm6
; SSE2-NEXT: pand %xmm5, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw %xmm4, %xmm3
; SSE2-NEXT: pand %xmm5, %xmm3
; SSE2-NEXT: packuswb %xmm6, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v64i8c:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm1, %xmm4
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117]
; SSE41-NEXT: pmullw %xmm6, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm7, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
; SSE41-NEXT: pmullw %xmm6, %xmm1
; SSE41-NEXT: pand %xmm7, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: pmovsxbw %xmm4, %xmm1
; SSE41-NEXT: pmullw %xmm6, %xmm1
; SSE41-NEXT: pand %xmm7, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
; SSE41-NEXT: pmullw %xmm6, %xmm4
; SSE41-NEXT: pand %xmm7, %xmm4
; SSE41-NEXT: packuswb %xmm4, %xmm1
; SSE41-NEXT: pmovsxbw %xmm2, %xmm4
; SSE41-NEXT: pmullw %xmm6, %xmm4
; SSE41-NEXT: pand %xmm7, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
; SSE41-NEXT: pmullw %xmm6, %xmm2
; SSE41-NEXT: pand %xmm7, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm4
; SSE41-NEXT: pmovsxbw %xmm3, %xmm5
; SSE41-NEXT: pmullw %xmm6, %xmm5
; SSE41-NEXT: pand %xmm7, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
; SSE41-NEXT: pmullw %xmm6, %xmm2
; SSE41-NEXT: pand %xmm7, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm5, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8c:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v64i8c:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm2
; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8c:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm1
; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117]
; AVX512BW-NEXT: vpmullw %zmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
  ret <64 x i8> %A
}

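; Variable v64i8 multiply.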
define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind {
; SSE2-LABEL: mul_v64i8:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: movdqa %xmm4, %xmm8
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm8
; SSE2-NEXT: movdqa %xmm0, %xmm9
; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm9
; SSE2-NEXT: pmullw %xmm8, %xmm9
; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm8, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm0
; SSE2-NEXT: pmullw %xmm4, %xmm0
; SSE2-NEXT: pand %xmm8, %xmm0
; SSE2-NEXT: packuswb %xmm9, %xmm0
; SSE2-NEXT: movdqa %xmm5, %xmm9
; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm9
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: pmullw %xmm9, %xmm4
; SSE2-NEXT: pand %xmm8, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: pmullw %xmm5, %xmm1
; SSE2-NEXT: pand %xmm8, %xmm1
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw %xmm6, %xmm2
; SSE2-NEXT: pand %xmm8, %xmm2
; SSE2-NEXT: packuswb %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm7, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm4
; SSE2-NEXT: movdqa %xmm3, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE2-NEXT: psraw $8, %xmm5
; SSE2-NEXT: pmullw %xmm4, %xmm5
; SSE2-NEXT: pand %xmm8, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm7
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmullw %xmm7, %xmm3
; SSE2-NEXT: pand %xmm8, %xmm3
; SSE2-NEXT: packuswb %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v64i8:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: movdqa %xmm1, %xmm8
; SSE41-NEXT: movdqa %xmm0, %xmm1
; SSE41-NEXT: pmovsxbw %xmm4, %xmm9
; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
; SSE41-NEXT: pmullw %xmm9, %xmm0
; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT: pand %xmm9, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm1, %xmm1
; SSE41-NEXT: pmullw %xmm4, %xmm1
; SSE41-NEXT: pand %xmm9, %xmm1
; SSE41-NEXT: packuswb %xmm1, %xmm0
; SSE41-NEXT: pmovsxbw %xmm5, %xmm4
; SSE41-NEXT: pmovsxbw %xmm8, %xmm1
; SSE41-NEXT: pmullw %xmm4, %xmm1
; SSE41-NEXT: pand %xmm9, %xmm1
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm4, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm5, %xmm5
; SSE41-NEXT: pmullw %xmm4, %xmm5
; SSE41-NEXT: pand %xmm9, %xmm5
; SSE41-NEXT: packuswb %xmm5, %xmm1
; SSE41-NEXT: pmovsxbw %xmm6, %xmm5
; SSE41-NEXT: pmovsxbw %xmm2, %xmm4
; SSE41-NEXT: pmullw %xmm5, %xmm4
; SSE41-NEXT: pand %xmm9, %xmm4
; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm5, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
; SSE41-NEXT: pmullw %xmm5, %xmm2
; SSE41-NEXT: pand %xmm9, %xmm2
; SSE41-NEXT: packuswb %xmm2, %xmm4
; SSE41-NEXT: pmovsxbw %xmm7, %xmm2
; SSE41-NEXT: pmovsxbw %xmm3, %xmm5
; SSE41-NEXT: pmullw %xmm2, %xmm5
; SSE41-NEXT: pand %xmm9, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE41-NEXT: pmovsxbw %xmm3, %xmm3
; SSE41-NEXT: pmullw %xmm2, %xmm3
; SSE41-NEXT: pand %xmm9, %xmm3
; SSE41-NEXT: packuswb %xmm3, %xmm5
; SSE41-NEXT: movdqa %xmm4, %xmm2
; SSE41-NEXT: movdqa %xmm5, %xmm3
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v64i8:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-NEXT: vpmovsxbw %xmm4, %ymm4
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5
; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm5
; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
; AVX2-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT: vpmovsxbw %xmm5, %ymm5
; AVX2-NEXT: vpmullw %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
; AVX2-NEXT: vpmovsxbw %xmm3, %ymm3
; AVX2-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: mul_v64i8:
; AVX512F: # BB#0: # %entry
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm4
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm5
; AVX512F-NEXT: vpmullw %ymm4, %ymm5, %ymm4
; AVX512F-NEXT: vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512F-NEXT: vpmovsxbw %xmm2, %ymm2
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm0
; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm2
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm4
; AVX512F-NEXT: vpmullw %ymm2, %ymm4, %ymm2
; AVX512F-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512F-NEXT: vpmovsxbw %xmm3, %ymm3
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512F-NEXT: vpmovsxbw %xmm1, %ymm1
; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: mul_v64i8:
; AVX512BW: # BB#0: # %entry
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm2
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm3
; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
; AVX512BW-NEXT: vpmovsxbw %ymm1, %zmm1
; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
entry:
  %A = mul <64 x i8> %i, %j
  ret <64 x i8> %A
}

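; When both i64 operands are zero-extended from i32, the high halves are known zero and a single pmuludq per pair yields the full 64-bit product.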
; PR30845
define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
; SSE2-LABEL: mul_v4i64_zero_upper:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE2-NEXT: pmuludq %xmm0, %xmm1
; SSE2-NEXT: pmuludq %xmm4, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_upper:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; SSE41-NEXT: pmuludq %xmm0, %xmm1
; SSE41-NEXT: pmuludq %xmm4, %xmm2
; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v4i64_zero_upper:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v4i64_zero_upper:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT: retq
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %val2a = zext <4 x i32> %val2 to <4 x i64>
  %res64 = mul <4 x i64> %val1a, %val2a
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

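; Only the left operand is known zero-extended here, so the cross term with val2's high half must still be computed and shifted into place.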
define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-LABEL: mul_v4i64_zero_upper_left:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: pmuludq %xmm2, %xmm3
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
; SSE2-NEXT: paddq %xmm3, %xmm2
; SSE2-NEXT: movdqa %xmm4, %xmm0
; SSE2-NEXT: pmuludq %xmm1, %xmm0
; SSE2-NEXT: psrlq $32, %xmm1
; SSE2-NEXT: pmuludq %xmm4, %xmm1
; SSE2-NEXT: psllq $32, %xmm1
; SSE2-NEXT: paddq %xmm1, %xmm0
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_upper_left:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE41-NEXT: movdqa %xmm0, %xmm3
; SSE41-NEXT: pmuludq %xmm2, %xmm3
; SSE41-NEXT: psrlq $32, %xmm2
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: psllq $32, %xmm2
; SSE41-NEXT: paddq %xmm3, %xmm2
; SSE41-NEXT: movdqa %xmm4, %xmm0
; SSE41-NEXT: pmuludq %xmm1, %xmm0
; SSE41-NEXT: psrlq $32, %xmm1
; SSE41-NEXT: pmuludq %xmm4, %xmm1
; SSE41-NEXT: psllq $32, %xmm1
; SSE41-NEXT: paddq %xmm1, %xmm0
; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v4i64_zero_upper_left:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v4i64_zero_upper_left:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT: retq
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %res64 = mul <4 x i64> %val1a, %val2
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

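; val2's low 32 bits are masked to zero, so the product reduces to the single cross term (val1.lo * val2.hi) << 32.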
define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
; SSE2-LABEL: mul_v4i64_zero_lower:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm4, %xmm4
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE2-NEXT: psrlq $32, %xmm2
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: psllq $32, %xmm2
; SSE2-NEXT: psrlq $32, %xmm1
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: psllq $32, %xmm3
; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
; SSE2-NEXT: movaps %xmm3, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v4i64_zero_lower:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; SSE41-NEXT: psrlq $32, %xmm2
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: psllq $32, %xmm2
; SSE41-NEXT: psrlq $32, %xmm1
; SSE41-NEXT: pmuludq %xmm1, %xmm3
; SSE41-NEXT: psllq $32, %xmm3
; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v4i64_zero_lower:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v4i64_zero_lower:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1
; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpsllq $32, %ymm0, %ymm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX512-NEXT: retq
entry:
  %val1a = zext <4 x i32> %val1 to <4 x i64>
  %val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296>
  %res64 = mul <4 x i64> %val1a, %val2a
  %rescast = bitcast <4 x i64> %res64 to <8 x i32>
  %res = shufflevector <8 x i32> %rescast, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ret <4 x i32> %res
}

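; The 8 x i64 form of the zero-extended multiply; AVX512 uses a single zmm pmuludq.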
define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_zero_upper:
; SSE2: # BB#0: # %entry
; SSE2-NEXT: pxor %xmm6, %xmm6
; SSE2-NEXT: movdqa %xmm0, %xmm4
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; SSE2-NEXT: movdqa %xmm2, %xmm8
; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE2-NEXT: movdqa %xmm3, %xmm7
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pmuludq %xmm7, %xmm5
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: pmuludq %xmm8, %xmm4
; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
; SSE2-NEXT: movaps %xmm4, %xmm0
; SSE2-NEXT: movaps %xmm5, %xmm1
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_zero_upper:
; SSE41: # BB#0: # %entry
; SSE41-NEXT: pxor %xmm6, %xmm6
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero
; SSE41-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; SSE41-NEXT: pmuludq %xmm1, %xmm3
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: pmuludq %xmm7, %xmm5
; SSE41-NEXT: pmuludq %xmm8, %xmm4
; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
; SSE41-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
; SSE41-NEXT: movaps %xmm4, %xmm0
; SSE41-NEXT: movaps %xmm5, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v8i64_zero_upper:
; AVX2: # BB#0: # %entry
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_zero_upper:
; AVX512: # BB#0: # %entry
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: retq
entry:
  %val1a = zext <8 x i32> %val1 to <8 x i64>
  %val2a = zext <8 x i32> %val2 to <8 x i64>
  %res64 = mul <8 x i64> %val1a, %val2a
  %rescast = bitcast <8 x i64> %res64 to <16 x i32>
  %res = shufflevector <16 x i32> %rescast, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7,i32 9, i32 11, i32 13, i32 15 >
  ret <8 x i32> %res
}

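; Sign-extended operands allow pmuldq (SSE4.1+), which multiplies the low 32 bits of each 64-bit lane with sign extension.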
define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) {
; SSE2-LABEL: mul_v8i64_sext:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrad $16, %xmm8
; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
; SSE2-NEXT: movdqa %xmm9, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrad $16, %xmm9
; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
; SSE2-NEXT: movdqa %xmm7, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrad $16, %xmm7
; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: psrad $16, %xmm0
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1]
; SSE2-NEXT: movdqa %xmm1, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT: movdqa %xmm2, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psrad $31, %xmm5
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT: movdqa %xmm4, %xmm5
; SSE2-NEXT: psrlq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm0, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: psrlq $32, %xmm6
; SSE2-NEXT: pmuludq %xmm4, %xmm6
; SSE2-NEXT: paddq %xmm5, %xmm6
; SSE2-NEXT: psllq $32, %xmm6
; SSE2-NEXT: pmuludq %xmm4, %xmm0
; SSE2-NEXT: paddq %xmm6, %xmm0
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: psrlq $32, %xmm4
; SSE2-NEXT: pmuludq %xmm7, %xmm4
; SSE2-NEXT: movdqa %xmm7, %xmm5
; SSE2-NEXT: psrlq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm2, %xmm5
; SSE2-NEXT: paddq %xmm4, %xmm5
; SSE2-NEXT: psllq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm7, %xmm2
; SSE2-NEXT: paddq %xmm5, %xmm2
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrlq $32, %xmm4
; SSE2-NEXT: pmuludq %xmm9, %xmm4
; SSE2-NEXT: movdqa %xmm9, %xmm5
; SSE2-NEXT: psrlq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm1, %xmm5
; SSE2-NEXT: paddq %xmm4, %xmm5
; SSE2-NEXT: psllq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm9, %xmm1
; SSE2-NEXT: paddq %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: psrlq $32, %xmm4
; SSE2-NEXT: pmuludq %xmm8, %xmm4
; SSE2-NEXT: movdqa %xmm8, %xmm5
; SSE2-NEXT: psrlq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm3, %xmm5
; SSE2-NEXT: paddq %xmm4, %xmm5
; SSE2-NEXT: psllq $32, %xmm5
; SSE2-NEXT: pmuludq %xmm8, %xmm3
; SSE2-NEXT: paddq %xmm5, %xmm3
; SSE2-NEXT: retq
;
; SSE41-LABEL: mul_v8i64_sext:
; SSE41: # BB#0:
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm8
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm6
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
; SSE41-NEXT: pmovsxwq %xmm3, %xmm7
; SSE41-NEXT: pmovsxwq %xmm0, %xmm5
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
; SSE41-NEXT: pmovsxdq %xmm2, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
; SSE41-NEXT: pmovsxdq %xmm1, %xmm0
; SSE41-NEXT: pmuldq %xmm5, %xmm0
; SSE41-NEXT: pmuldq %xmm7, %xmm4
; SSE41-NEXT: pmuldq %xmm6, %xmm2
; SSE41-NEXT: pmuldq %xmm8, %xmm3
; SSE41-NEXT: movdqa %xmm4, %xmm1
; SSE41-NEXT: retq
;
; AVX2-LABEL: mul_v8i64_sext:
; AVX2: # BB#0:
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; AVX2-NEXT: vpmovsxwq %xmm2, %ymm2
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3
; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: mul_v8i64_sext:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
; AVX512-NEXT: vpmovsxdq %ymm1, %zmm1
; AVX512-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: retq
  %1 = sext <8 x i16> %val1 to <8 x i64>
  %2 = sext <8 x i32> %val2 to <8 x i64>
  %3 = mul <8 x i64> %1, %2
  ret <8 x i64> %3
}