; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX1
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX --check-prefix=X32-AVX2
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512F
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512BW
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512 --check-prefix=X32-AVX512DQ
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512BW
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512 --check-prefix=X64-AVX512DQ

;
; Subvector Load + Broadcast
;

define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2f64_4f64:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_2f64_4f64:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2f64_4f64:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_2f64_4f64:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT: retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}
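
; Every test in this file exercises the same generic shape: a subvector load
; whose lanes are repeated by a shufflevector, as in the test above:
;   %v = load <2 x double>, <2 x double> *%p
;   %b = shufflevector <2 x double> %v, <2 x double> undef,
;                      <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; The intent (a sketch of the expectation, not a FileCheck assertion) is that
; the load folds into a single subvector broadcast such as vbroadcastf128.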
define <8 x double> @test_broadcast_2f64_8f64(<2 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2f64_8f64:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_2f64_8f64:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_2f64_8f64:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2f64_8f64:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2f64_8f64:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vbroadcastf64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
; X64-AVX512DQ-NEXT: retq
  %1 = load <2 x double>, <2 x double> *%p
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x double> %2
}

define <8 x double> @test_broadcast_4f64_8f64(<4 x double> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f64_8f64:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4f64_8f64:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f64_8f64:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f64_8f64:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x double>, <4 x double> *%p
  %2 = shufflevector <4 x double> %1, <4 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_2i64_4i64:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_2i64_4i64:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX512DQ-NEXT: retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x i64> @test_broadcast_2i64_8i64(<2 x i64> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_2i64_8i64:
; X32-AVX1: ## BB#0:
; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: test_broadcast_2i64_8i64:
; X32-AVX2: ## BB#0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_2i64_8i64:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_2i64_8i64:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX1-LABEL: test_broadcast_2i64_8i64:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_broadcast_2i64_8i64:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2i64_8i64:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vbroadcasti64x2 {{.*#+}} zmm0 = mem[0,1,0,1,0,1,0,1]
; X64-AVX512DQ-NEXT: retq
  %1 = load <2 x i64>, <2 x i64> *%p
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>
  ret <8 x i64> %2
}

define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i64_8i64:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i64_8i64:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i64_8i64:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i64_8i64:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x i64>, <4 x i64> *%p
  %2 = shufflevector <4 x i64> %1, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f32_8f32:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4f32_8f32:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f32_8f32:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_8f32:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <16 x float> @test_broadcast_4f32_16f32(<4 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4f32_16f32:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4f32_16f32:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4f32_16f32:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4f32_16f32:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vbroadcastf32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x float> %2
}

define <16 x float> @test_broadcast_8f32_16f32(<8 x float> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8f32_16f32:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_8f32_16f32:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_8f32_16f32:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8f32_16f32:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcastf64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8f32_16f32:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vbroadcastf32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; X64-AVX512DQ-NEXT: retq
  %1 = load <8 x float>, <8 x float> *%p
  %2 = shufflevector <8 x float> %1, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_4i32_8i32:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_4i32_16i32:
; X32-AVX1: ## BB#0:
; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: test_broadcast_4i32_16i32:
; X32-AVX2: ## BB#0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_16i32:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: test_broadcast_4i32_16i32:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_broadcast_4i32_16i32:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_16i32:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

define <16 x i32> @test_broadcast_8i32_16i32(<8 x i32> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i32_16i32:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_8i32_16i32:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i32_16i32:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i32_16i32:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i32_16i32:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vbroadcasti32x8 {{.*#+}} zmm0 = mem[0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7]
; X64-AVX512DQ-NEXT: retq
  %1 = load <8 x i32>, <8 x i32> *%p
  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i32> %2
}
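
; The i16/i8 tests below also exercise type legalization: 512-bit i16/i8
; vectors are only legal with AVX512BW, so the AVX512F/AVX512DQ runs are
; expected to split the result into two 256-bit halves (broadcast into %ymm0,
; then copy to %ymm1) instead of forming a single %zmm0.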
define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_16i16:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_8i16_16i16:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_8i16_32i16:
; X32-AVX1: ## BB#0:
; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: test_broadcast_8i16_32i16:
; X32-AVX2: ## BB#0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX1-LABEL: test_broadcast_8i16_32i16:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_broadcast_8i16_32i16:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <32 x i16> %2
}

define <32 x i16> @test_broadcast_16i16_32i16(<16 x i16> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i16_32i16:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i16_32i16:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i16_32i16:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
  %1 = load <16 x i16>, <16 x i16> *%p
  %2 = shufflevector <16 x i16> %1, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_32i8:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_16i8_32i8:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512-NEXT: retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
; X32-AVX1-LABEL: test_broadcast_16i8_64i8:
; X32-AVX1: ## BB#0:
; X32-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: test_broadcast_16i8_64i8:
; X32-AVX2: ## BB#0:
; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X32-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX2-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX1-LABEL: test_broadcast_16i8_64i8:
; X64-AVX1: ## BB#0:
; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX1-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: test_broadcast_16i8_64i8:
; X64-AVX2: ## BB#0:
; X64-AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-AVX2-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX2-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512F-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <64 x i8> %2
}

define <64 x i8> @test_broadcast_32i8_64i8(<32 x i8> *%p) nounwind {
; X32-AVX-LABEL: test_broadcast_32i8_64i8:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: vmovaps (%eax), %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: vmovaps (%eax), %ymm0
; X32-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_32i8_64i8:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512F-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_32i8_64i8:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
; X64-AVX512DQ-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX512DQ-NEXT: retq
  %1 = load <32 x i8>, <32 x i8> *%p
  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <64 x i8> %2
}

;
; Subvector Load + Broadcast + Store
;
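
; These variants also store the loaded subvector, mirroring the IR of the
; first test below:
;   %v = load <2 x double>, <2 x double>* %p0
;   store <2 x double> %v, <2 x double>* %p1
;   %b = shufflevector <2 x double> %v, <2 x double> undef,
;                      <4 x i32> <i32 0, i32 1, i32 0, i32 1>
; With a second use of the loaded value, the load can no longer be folded
; into the broadcast, so the expected codegen keeps the value in %xmm0 and
; widens it with vinsertf128 instead.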
define <4 x double> @test_broadcast_2f64_4f64_reuse(<2 x double>* %p0, <2 x double>* %p1) {
; X32-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_2f64_4f64_reuse:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512F-NEXT: vmovaps %xmm0, (%eax)
; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX512BW-NEXT: vmovaps %xmm0, (%eax)
; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovapd (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vmovapd %xmm0, (%eax)
; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512F-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64_reuse:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovapd (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vmovapd %xmm0, (%rsi)
; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = load <2 x double>, <2 x double>* %p0
  store <2 x double> %1, <2 x double>* %p1
  %2 = shufflevector <2 x double> %1, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %2
}

define <4 x i64> @test_broadcast_2i64_4i64_reuse(<2 x i64>* %p0, <2 x i64>* %p1) {
; X32-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax)
; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_2i64_4i64_reuse:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT: retq
  %1 = load <2 x i64>, <2 x i64>* %p0
  store <2 x i64> %1, <2 x i64>* %p1
  %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %2
}

define <8 x float> @test_broadcast_4f32_8f32_reuse(<4 x float>* %p0, <4 x float>* %p1) {
; X32-LABEL: test_broadcast_4f32_8f32_reuse:
; X32: ## BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT: vmovaps (%ecx), %xmm0
; X32-NEXT: vmovaps %xmm0, (%eax)
; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-NEXT: retl
;
; X64-LABEL: test_broadcast_4f32_8f32_reuse:
; X64: ## BB#0:
; X64-NEXT: vmovaps (%rdi), %xmm0
; X64-NEXT: vmovaps %xmm0, (%rsi)
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT: retq
  %1 = load <4 x float>, <4 x float>* %p0
  store <4 x float> %1, <4 x float>* %p1
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
}

define <8 x i32> @test_broadcast_4i32_8i32_reuse(<4 x i32>* %p0, <4 x i32>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X32-AVX512: ## BB#0:
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512-NEXT: vmovdqa %xmm0, (%eax)
; X32-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: test_broadcast_4i32_8i32_reuse:
; X64-AVX512: ## BB#0:
; X64-AVX512-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT: retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x i32> %1, <4 x i32>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i16> @test_broadcast_8i16_16i16_reuse(<8 x i16> *%p0, <8 x i16> *%p1) nounwind {
; X32-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_8i16_16i16_reuse:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = load <8 x i16>, <8 x i16> *%p0
  store <8 x i16> %1, <8 x i16>* %p1
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i16> %2
}

define <32 x i8> @test_broadcast_16i8_32i8_reuse(<16 x i8> *%p0, <16 x i8> *%p1) nounwind {
; X32-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX-NEXT: vmovaps %xmm0, (%eax)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT: vmovdqa %xmm0, (%eax)
; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovdqu (%ecx), %xmm0
; X32-AVX512BW-NEXT: vmovdqu %xmm0, (%eax)
; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vmovdqa %xmm0, (%eax)
; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps %xmm0, (%rsi)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
; X64-AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_16i8_32i8_reuse:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = load <16 x i8>, <16 x i8> *%p0
  store <16 x i8> %1, <16 x i8>* %p1
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <32 x i8> %2
}

;
; Subvector Load + Broadcast with Separate Store
;
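
; Here the store is of an unrelated zero vector, so only the memory chain
; ties it to the load, mirroring the IR of the tests below:
;   %v = load <4 x i32>, <4 x i32>* %p0
;   store <4 x float> zeroinitializer, <4 x float>* %p1
;   %b = shufflevector <4 x i32> %v, <4 x i32> undef,
;                      <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>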
define <8 x i32> @test_broadcast_4i32_8i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
; X32-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
; X32-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
; X64-AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; X64-AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_8i32_chain:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i32> %2
}

define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>* %p1) {
; X32-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX: ## BB#0:
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX-NEXT: vmovaps (%ecx), %xmm0
; X32-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512F: ## BB#0:
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512F-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512F-NEXT: vmovdqa %xmm1, (%eax)
; X32-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X32-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512BW: ## BB#0:
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512BW-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512BW-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X32-AVX512BW-NEXT: vmovdqa %xmm1, (%eax)
; X32-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
; X32-AVX512DQ: ## BB#0:
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-AVX512DQ-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-AVX512DQ-NEXT: vmovdqa (%ecx), %xmm0
; X32-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X32-AVX512DQ-NEXT: vmovaps %xmm1, (%eax)
; X32-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X32-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512DQ-NEXT: retl
;
; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX: ## BB#0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512F: ## BB#0:
; X64-AVX512F-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512F-NEXT: vmovdqa %xmm1, (%rsi)
; X64-AVX512F-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X64-AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512BW: ## BB#0:
; X64-AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; X64-AVX512BW-NEXT: vmovdqa %xmm1, (%rsi)
; X64-AVX512BW-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
; X64-AVX512DQ: ## BB#0:
; X64-AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; X64-AVX512DQ-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
; X64-AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %zmm0, %zmm0
; X64-AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512DQ-NEXT: retq
  %1 = load <4 x i32>, <4 x i32>* %p0
  store <4 x float> zeroinitializer, <4 x float>* %p1
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i32> %2
}

;
; Subvector Load with Multiple Uses + Broadcast
; Fallback to the broadcast should be done
;
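
; For example, in @fallback_broadcast_v4i64_to_v8i64 below the <8 x i64>
; constant <1, 2, 3, 4, 1, 2, 3, 4> is a splat of the <4 x i64> constant
; <1, 2, 3, 4>, so on AVX512 the expected codegen materializes the 256-bit
; constant once and widens it with vinserti64x4 rather than loading it twice.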
@ga4 = global <4 x i64> zeroinitializer, align 8
@gb4 = global <8 x i64> zeroinitializer, align 8

define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX1: ## BB#0: ## %entry
; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X32-AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,0,3,0,4,0]
; X32-AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
; X32-AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X32-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
; X32-AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; X32-AVX1-NEXT: vpaddq %xmm5, %xmm3, %xmm3
; X32-AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; X32-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
; X32-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
; X32-AVX1-NEXT: vmovups %ymm0, _ga4
; X32-AVX1-NEXT: vmovups %ymm2, _gb4+32
; X32-AVX1-NEXT: vmovups %ymm1, _gb4
; X32-AVX1-NEXT: vzeroupper
; X32-AVX1-NEXT: retl
;
; X32-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX2: ## BB#0: ## %entry
; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,0,2,0,3,0,4,0]
; X32-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X32-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; X32-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; X32-AVX2-NEXT: vmovdqu %ymm0, _ga4
; X32-AVX2-NEXT: vmovdqu %ymm2, _gb4+32
; X32-AVX2-NEXT: vmovdqu %ymm1, _gb4
; X32-AVX2-NEXT: vzeroupper
; X32-AVX2-NEXT: retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X32-AVX512: ## BB#0: ## %entry
; X32-AVX512-NEXT: vpaddq LCPI26_0, %ymm0, %ymm0
; X32-AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,2,0,3,0,4,0,1,0,2,0,3,0,4,0]
; X32-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vmovdqu %ymm0, _ga4
; X32-AVX512-NEXT: vmovdqu64 %zmm1, _gb4
; X32-AVX512-NEXT: retl
;
; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX1: ## BB#0: ## %entry
; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [3,4]
; X64-AVX1-NEXT: vpaddq %xmm4, %xmm3, %xmm3
; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2]
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2
; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; X64-AVX1-NEXT: vpaddq %xmm4, %xmm6, %xmm4
; X64-AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
; X64-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
; X64-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
; X64-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vmovups %ymm0, {{.*}}(%rip)
; X64-AVX1-NEXT: vmovups %ymm2, _gb4+{{.*}}(%rip)
; X64-AVX1-NEXT: vmovups %ymm1, {{.*}}(%rip)
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
; X64-AVX2-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX2: ## BB#0: ## %entry
; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4]
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
; X64-AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX2-NEXT: vmovdqu %ymm2, _gb4+{{.*}}(%rip)
; X64-AVX2-NEXT: vmovdqu %ymm1, {{.*}}(%rip)
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4i64_to_v8i64:
; X64-AVX512: ## BB#0: ## %entry
; X64-AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,3,4]
; X64-AVX512-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovdqu %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT: vmovdqu64 %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT: retq
entry:
  %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  %1 = add <8 x i64> %b, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  %2 = and <8 x i64> %1, <i64 1, i64 2, i64 3, i64 4, i64 1, i64 2, i64 3, i64 4>
  store <4 x i64> %0, <4 x i64>* @ga4, align 8
  store <8 x i64> %2, <8 x i64>* @gb4, align 8
  ret void
}


@ga2 = global <4 x double> zeroinitializer, align 8
@gb2 = global <8 x double> zeroinitializer, align 8

define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b) {
; X32-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X32-AVX: ## BB#0: ## %entry
; X32-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; X32-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
; X32-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
; X32-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
; X32-AVX-NEXT: vmovupd %ymm0, _ga2
; X32-AVX-NEXT: vmovupd %ymm2, _gb2+32
; X32-AVX-NEXT: vmovupd %ymm1, _gb2
; X32-AVX-NEXT: vzeroupper
; X32-AVX-NEXT: retl
;
; X32-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X32-AVX512: ## BB#0: ## %entry
; X32-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X32-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X32-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X32-AVX512-NEXT: vmovupd %ymm0, _ga2
; X32-AVX512-NEXT: vmovupd %zmm1, _gb2
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX: ## BB#0: ## %entry
; X64-AVX-NEXT: vmovapd {{.*#+}} ymm3 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX-NEXT: vaddpd %ymm3, %ymm0, %ymm0
; X64-AVX-NEXT: vaddpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT: vaddpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT: vdivpd %ymm3, %ymm1, %ymm1
; X64-AVX-NEXT: vdivpd %ymm3, %ymm2, %ymm2
; X64-AVX-NEXT: vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX-NEXT: vmovupd %ymm2, _gb2+{{.*}}(%rip)
; X64-AVX-NEXT: vmovupd %ymm1, {{.*}}(%rip)
; X64-AVX-NEXT: vzeroupper
; X64-AVX-NEXT: retq
;
; X64-AVX512-LABEL: fallback_broadcast_v4f64_to_v8f64:
; X64-AVX512: ## BB#0: ## %entry
; X64-AVX512-NEXT: vmovapd {{.*#+}} ymm2 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
; X64-AVX512-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm2
; X64-AVX512-NEXT: vaddpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vdivpd %zmm2, %zmm1, %zmm1
; X64-AVX512-NEXT: vmovupd %ymm0, {{.*}}(%rip)
; X64-AVX512-NEXT: vmovupd %zmm1, {{.*}}(%rip)
; X64-AVX512-NEXT: retq
entry:
  %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
  %1 = fadd <8 x double> %b, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  %2 = fdiv <8 x double> %1, <double 1.0, double 2.0, double 3.0, double 4.0, double 1.0, double 2.0, double 3.0, double 4.0>
  store <4 x double> %0, <4 x double>* @ga2, align 8
  store <8 x double> %2, <8 x double>* @gb2, align 8
  ret void
}