; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=SSE32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+sse2 < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=SSE64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVXONLY32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVXONLY64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=KNL32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=KNL64
; RUN: llc -mtriple=x86_64-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512vl,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL32 --check-prefix=AVX32 --check-prefix=AVX51232 --check-prefix=SKX32
; RUN: llc -mtriple=i686-none-linux -fast-isel -fast-isel-abort=1 -mattr=+avx512f,+avx512dq,+avx512bw < %s | FileCheck %s --check-prefix=ALL64 --check-prefix=AVX64 --check-prefix=AVX51264 --check-prefix=SKX64

define i32 @test_store_32(i32* nocapture %addr, i32 %value) {
; ALL32-LABEL: test_store_32:
; ALL32:       # BB#0: # %entry
; ALL32-NEXT:    movl %esi, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_32:
; ALL64:       # BB#0: # %entry
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movl %eax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i32 %value, i32* %addr, align 1
  ret i32 %value
}

define i16 @test_store_16(i16* nocapture %addr, i16 %value) {
; ALL32-LABEL: test_store_16:
; ALL32:       # BB#0: # %entry
; ALL32-NEXT:    movw %si, (%rdi)
; ALL32-NEXT:    movl %esi, %eax
; ALL32-NEXT:    retq
;
; ALL64-LABEL: test_store_16:
; ALL64:       # BB#0: # %entry
; ALL64-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; ALL64-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; ALL64-NEXT:    movw %ax, (%ecx)
; ALL64-NEXT:    retl
entry:
  store i16 %value, i16* %addr, align 1
  ret i16 %value
}

define <4 x i32> @test_store_4xi32(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32:
; SSE32:       # BB#0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqu %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqu %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_4xi32:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXONLY32-NEXT:    vmovdqu %xmm0, (%rdi)
; AVXONLY32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqu %xmm0, (%eax)
; AVX64-NEXT:    retl
;
; KNL32-LABEL: test_store_4xi32:
; KNL32:       # BB#0:
; KNL32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; KNL32-NEXT:    vmovdqu %xmm0, (%rdi)
; KNL32-NEXT:    retq
;
; SKX32-LABEL: test_store_4xi32:
; SKX32:       # BB#0:
; SKX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; SKX32-NEXT:    vmovdqu %xmm0, (%rdi)
; SKX32-NEXT:    retq
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 1
  ret <4 x i32> %foo
}

define <4 x i32> @test_store_4xi32_aligned(<4 x i32>* nocapture %addr, <4 x i32> %value, <4 x i32> %value2) {
; SSE32-LABEL: test_store_4xi32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    paddd %xmm1, %xmm0
; SSE32-NEXT:    movdqa %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xi32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    paddd %xmm1, %xmm0
; SSE64-NEXT:    movdqa %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_4xi32_aligned:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVXONLY32-NEXT:    vmovdqa %xmm0, (%rdi)
; AVXONLY32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xi32_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovdqa %xmm0, (%eax)
; AVX64-NEXT:    retl
;
; KNL32-LABEL: test_store_4xi32_aligned:
; KNL32:       # BB#0:
; KNL32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; KNL32-NEXT:    vmovdqa %xmm0, (%rdi)
; KNL32-NEXT:    retq
;
; SKX32-LABEL: test_store_4xi32_aligned:
; SKX32:       # BB#0:
; SKX32-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; SKX32-NEXT:    vmovdqa %xmm0, (%rdi)
; SKX32-NEXT:    retq
  %foo = add <4 x i32> %value, %value2 ; to force integer type on store
  store <4 x i32> %foo, <4 x i32>* %addr, align 16
  ret <4 x i32> %foo
}

define <4 x float> @test_store_4xf32(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovups %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 1
  ret <4 x float> %value
}

define <4 x float> @test_store_4xf32_aligned(<4 x float>* nocapture %addr, <4 x float> %value) {
; SSE32-LABEL: test_store_4xf32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf32_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovaps %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf32_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %xmm0, (%eax)
; AVX64-NEXT:    retl
  store <4 x float> %value, <4 x float>* %addr, align 16
  ret <4 x float> %value
}

define <2 x double> @test_store_2xf64(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64:
; AVX32:       # BB#0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovupd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovupd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 1
  ret <2 x double> %foo
}

define <2 x double> @test_store_2xf64_aligned(<2 x double>* nocapture %addr, <2 x double> %value, <2 x double> %value2) {
; SSE32-LABEL: test_store_2xf64_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm1, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_2xf64_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd %xmm1, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_2xf64_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX32-NEXT:    vmovapd %xmm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_2xf64_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; AVX64-NEXT:    vmovapd %xmm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <2 x double> %value, %value2 ; to force double type on store
  store <2 x double> %foo, <2 x double>* %addr, align 16
  ret <2 x double> %foo
}

define <8 x i32> @test_store_8xi32(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 1
  ret <8 x i32> %value
}

define <8 x i32> @test_store_8xi32_aligned(<8 x i32>* nocapture %addr, <8 x i32> %value) {
; SSE32-LABEL: test_store_8xi32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xi32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xi32_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xi32_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x i32> %value, <8 x i32>* %addr, align 32
  ret <8 x i32> %value
}

define <8 x float> @test_store_8xf32(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovups %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovups %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 1
  ret <8 x float> %value
}

define <8 x float> @test_store_8xf32_aligned(<8 x float>* nocapture %addr, <8 x float> %value) {
; SSE32-LABEL: test_store_8xf32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_8xf32_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vmovaps %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_8xf32_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vmovaps %ymm0, (%eax)
; AVX64-NEXT:    retl
  store <8 x float> %value, <8 x float>* %addr, align 32
  ret <8 x float> %value
}

define <4 x double> @test_store_4xf64(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi0:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64:
; AVX32:       # BB#0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovupd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovupd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 1
  ret <4 x double> %foo
}

define <4 x double> @test_store_4xf64_aligned(<4 x double>* nocapture %addr, <4 x double> %value, <4 x double> %value2) {
; SSE32-LABEL: test_store_4xf64_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm3, %xmm1
; SSE32-NEXT:    addpd %xmm2, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_4xf64_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi1:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd %xmm2, %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVX32-LABEL: test_store_4xf64_aligned:
; AVX32:       # BB#0:
; AVX32-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX32-NEXT:    vmovapd %ymm0, (%rdi)
; AVX32-NEXT:    retq
;
; AVX64-LABEL: test_store_4xf64_aligned:
; AVX64:       # BB#0:
; AVX64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX64-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; AVX64-NEXT:    vmovapd %ymm0, (%eax)
; AVX64-NEXT:    retl
  %foo = fadd <4 x double> %value, %value2 ; to force double type on store
  store <4 x double> %foo, <4 x double>* %addr, align 32
  ret <4 x double> %foo
}

define <16 x i32> @test_store_16xi32(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi2:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 1
  ret <16 x i32> %value
}

define <16 x i32> @test_store_16xi32_aligned(<16 x i32>* nocapture %addr, <16 x i32> %value) {
; SSE32-LABEL: test_store_16xi32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xi32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi3:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xi32_aligned:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xi32_aligned:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xi32_aligned:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xi32_aligned:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x i32> %value, <16 x i32>* %addr, align 64
  ret <16 x i32> %value
}

define <16 x float> @test_store_16xf32(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32:
; SSE32:       # BB#0:
; SSE32-NEXT:    movups %xmm0, (%rdi)
; SSE32-NEXT:    movups %xmm1, 16(%rdi)
; SSE32-NEXT:    movups %xmm2, 32(%rdi)
; SSE32-NEXT:    movups %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi4:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movups %xmm0, (%eax)
; SSE64-NEXT:    movups %xmm1, 16(%eax)
; SSE64-NEXT:    movups %xmm2, 32(%eax)
; SSE64-NEXT:    movups %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vmovups %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovups %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovups %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovups %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vmovups %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovups %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 1
  ret <16 x float> %value
}

define <16 x float> @test_store_16xf32_aligned(<16 x float>* nocapture %addr, <16 x float> %value) {
; SSE32-LABEL: test_store_16xf32_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    movaps %xmm0, (%rdi)
; SSE32-NEXT:    movaps %xmm1, 16(%rdi)
; SSE32-NEXT:    movaps %xmm2, 32(%rdi)
; SSE32-NEXT:    movaps %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_16xf32_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi5:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movaps {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    movaps %xmm0, (%eax)
; SSE64-NEXT:    movaps %xmm1, 16(%eax)
; SSE64-NEXT:    movaps %xmm2, 32(%eax)
; SSE64-NEXT:    movaps %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_16xf32_aligned:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vmovaps %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovaps %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_16xf32_aligned:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVXONLY64-NEXT:    vmovaps %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovaps %ymm1, 32(%eax)
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_16xf32_aligned:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vmovaps %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_16xf32_aligned:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vmovaps %zmm0, (%eax)
; AVX51264-NEXT:    retl
  store <16 x float> %value, <16 x float>* %addr, align 64
  ret <16 x float> %value
}

define <8 x double> @test_store_8xf64(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movupd %xmm0, (%rdi)
; SSE32-NEXT:    movupd %xmm1, 16(%rdi)
; SSE32-NEXT:    movupd %xmm2, 32(%rdi)
; SSE32-NEXT:    movupd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi6:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movupd %xmm0, (%eax)
; SSE64-NEXT:    movupd %xmm1, 16(%eax)
; SSE64-NEXT:    movupd %xmm2, 32(%eax)
; SSE64-NEXT:    movupd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovupd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovupd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:  .Lcfi0:
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:  .Lcfi1:
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:  .Lcfi2:
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovupd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovupd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovupd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovupd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 1
  ret <8 x double> %foo
}

define <8 x double> @test_store_8xf64_aligned(<8 x double>* nocapture %addr, <8 x double> %value, <8 x double> %value2) {
; SSE32-LABEL: test_store_8xf64_aligned:
; SSE32:       # BB#0:
; SSE32-NEXT:    addpd %xmm7, %xmm3
; SSE32-NEXT:    addpd %xmm6, %xmm2
; SSE32-NEXT:    addpd %xmm5, %xmm1
; SSE32-NEXT:    addpd %xmm4, %xmm0
; SSE32-NEXT:    movapd %xmm0, (%rdi)
; SSE32-NEXT:    movapd %xmm1, 16(%rdi)
; SSE32-NEXT:    movapd %xmm2, 32(%rdi)
; SSE32-NEXT:    movapd %xmm3, 48(%rdi)
; SSE32-NEXT:    retq
;
; SSE64-LABEL: test_store_8xf64_aligned:
; SSE64:       # BB#0:
; SSE64-NEXT:    subl $12, %esp
; SSE64-NEXT:  .Lcfi7:
; SSE64-NEXT:    .cfi_def_cfa_offset 16
; SSE64-NEXT:    movapd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm3
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm2
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm1
; SSE64-NEXT:    addpd {{[0-9]+}}(%esp), %xmm0
; SSE64-NEXT:    movapd %xmm0, (%eax)
; SSE64-NEXT:    movapd %xmm1, 16(%eax)
; SSE64-NEXT:    movapd %xmm2, 32(%eax)
; SSE64-NEXT:    movapd %xmm3, 48(%eax)
; SSE64-NEXT:    addl $12, %esp
; SSE64-NEXT:    retl
;
; AVXONLY32-LABEL: test_store_8xf64_aligned:
; AVXONLY32:       # BB#0:
; AVXONLY32-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
; AVXONLY32-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY32-NEXT:    vmovapd %ymm0, (%rdi)
; AVXONLY32-NEXT:    vmovapd %ymm1, 32(%rdi)
; AVXONLY32-NEXT:    retq
;
; AVXONLY64-LABEL: test_store_8xf64_aligned:
; AVXONLY64:       # BB#0:
; AVXONLY64-NEXT:    pushl %ebp
; AVXONLY64-NEXT:  .Lcfi3:
; AVXONLY64-NEXT:    .cfi_def_cfa_offset 8
; AVXONLY64-NEXT:  .Lcfi4:
; AVXONLY64-NEXT:    .cfi_offset %ebp, -8
; AVXONLY64-NEXT:    movl %esp, %ebp
; AVXONLY64-NEXT:  .Lcfi5:
; AVXONLY64-NEXT:    .cfi_def_cfa_register %ebp
; AVXONLY64-NEXT:    andl $-32, %esp
; AVXONLY64-NEXT:    subl $32, %esp
; AVXONLY64-NEXT:    movl 8(%ebp), %eax
; AVXONLY64-NEXT:    vaddpd 40(%ebp), %ymm1, %ymm1
; AVXONLY64-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; AVXONLY64-NEXT:    vmovapd %ymm0, (%eax)
; AVXONLY64-NEXT:    vmovapd %ymm1, 32(%eax)
; AVXONLY64-NEXT:    movl %ebp, %esp
; AVXONLY64-NEXT:    popl %ebp
; AVXONLY64-NEXT:    retl
;
; AVX51232-LABEL: test_store_8xf64_aligned:
; AVX51232:       # BB#0:
; AVX51232-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51232-NEXT:    vmovapd %zmm0, (%rdi)
; AVX51232-NEXT:    retq
;
; AVX51264-LABEL: test_store_8xf64_aligned:
; AVX51264:       # BB#0:
; AVX51264-NEXT:    movl {{[0-9]+}}(%esp), %eax
; AVX51264-NEXT:    vaddpd %zmm1, %zmm0, %zmm0
; AVX51264-NEXT:    vmovapd %zmm0, (%eax)
; AVX51264-NEXT:    retl
  %foo = fadd <8 x double> %value, %value2 ; to force double type on store
  store <8 x double> %foo, <8 x double>* %addr, align 64
  ret <8 x double> %foo
}