; Review summary of this test:
;   llc is run for the amdgcn-amd-amdpal triple on five CPUs (gfx900, hawaii,
;   tahiti, gfx1010, gfx1100); the output is matched by FileCheck under the
;   prefixes GFX9, GFX7, GFX6, GFX10 and GFX11 respectively.
;   Every kernel stores its <3 x i32> argument %x through the addrspace(3)
;   pointer %out. The kernels differ only in the alignment given on the store
;   (none, then align 1, 2, 4, 8 and 16).
; What the recorded assertions expect for the store itself:
;   - no explicit align, and align 16: a single ds_write_b96 (ds_store_b96 on
;     GFX11), except on GFX6, which expects ds_write_b32 at offset 8 plus
;     ds_write_b64.
;   - align 8: one 32-bit write at offset 8 plus one 64-bit write, on every prefix.
;   - align 4: ds_write2_b32 (ds_store_2addr_b32 on GFX11) plus one 32-bit
;     write at offset 8.
;   - align 2: six 16-bit writes; GFX9, GFX10 and GFX11 use the _d16_hi forms
;     for the upper halves, GFX7 and GFX6 shift with s_lshr_b32 instead.
;   - align 1: twelve 8-bit writes covering offsets 0 through 11.
; Per the header note the assertions were produced by
; utils/update_llc_test_checks.py; presumably they should be regenerated with
; that script rather than edited by hand.
; NOTE(review): the physical lines below look collapsed, with the original line
; numbers fused into the text - presumably an extraction artifact; confirm
; against the checked-in file before relying on this layout.
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s 3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s 4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s 5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s 6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s 7 8define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { 9; GFX9-LABEL: store_lds_v3i32: 10; GFX9: ; %bb.0: 11; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 12; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 13; GFX9-NEXT: s_waitcnt lgkmcnt(0) 14; GFX9-NEXT: v_mov_b32_e32 v0, s4 15; GFX9-NEXT: v_mov_b32_e32 v1, s5 16; GFX9-NEXT: v_mov_b32_e32 v2, s6 17; GFX9-NEXT: v_mov_b32_e32 v3, s2 18; GFX9-NEXT: ds_write_b96 v3, v[0:2] 19; GFX9-NEXT: s_endpgm 20; 21; GFX7-LABEL: store_lds_v3i32: 22; GFX7: ; %bb.0: 23; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 24; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 25; GFX7-NEXT: s_mov_b32 m0, -1 26; GFX7-NEXT: s_waitcnt lgkmcnt(0) 27; GFX7-NEXT: v_mov_b32_e32 v0, s4 28; GFX7-NEXT: v_mov_b32_e32 v1, s5 29; GFX7-NEXT: v_mov_b32_e32 v2, s6 30; GFX7-NEXT: v_mov_b32_e32 v3, s0 31; GFX7-NEXT: ds_write_b96 v3, v[0:2] 32; GFX7-NEXT: s_endpgm 33; 34; GFX6-LABEL: store_lds_v3i32: 35; GFX6: ; %bb.0: 36; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 37; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 38; GFX6-NEXT: s_mov_b32 m0, -1 39; GFX6-NEXT: s_waitcnt lgkmcnt(0) 40; GFX6-NEXT: v_mov_b32_e32 v2, s4 41; GFX6-NEXT: v_mov_b32_e32 v1, s2 42; GFX6-NEXT: v_mov_b32_e32 v0, s0 43; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 44; GFX6-NEXT: v_mov_b32_e32 v1, s1 45; GFX6-NEXT: ds_write_b64 v2, 
v[0:1] 46; GFX6-NEXT: s_endpgm 47; 48; GFX10-LABEL: store_lds_v3i32: 49; GFX10: ; %bb.0: 50; GFX10-NEXT: s_clause 0x1 51; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 52; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 53; GFX10-NEXT: s_waitcnt lgkmcnt(0) 54; GFX10-NEXT: v_mov_b32_e32 v0, s4 55; GFX10-NEXT: v_mov_b32_e32 v1, s5 56; GFX10-NEXT: v_mov_b32_e32 v2, s6 57; GFX10-NEXT: v_mov_b32_e32 v3, s2 58; GFX10-NEXT: ds_write_b96 v3, v[0:2] 59; GFX10-NEXT: s_endpgm 60; 61; GFX11-LABEL: store_lds_v3i32: 62; GFX11: ; %bb.0: 63; GFX11-NEXT: s_clause 0x1 64; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 65; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 66; GFX11-NEXT: s_waitcnt lgkmcnt(0) 67; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 68; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0 69; GFX11-NEXT: ds_store_b96 v3, v[0:2] 70; GFX11-NEXT: s_endpgm 71 store <3 x i32> %x, <3 x i32> addrspace(3)* %out 72 ret void 73} 74 75define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { 76; GFX9-LABEL: store_lds_v3i32_align1: 77; GFX9: ; %bb.0: 78; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 79; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 80; GFX9-NEXT: s_waitcnt lgkmcnt(0) 81; GFX9-NEXT: v_mov_b32_e32 v0, s2 82; GFX9-NEXT: v_mov_b32_e32 v1, s6 83; GFX9-NEXT: v_mov_b32_e32 v2, s4 84; GFX9-NEXT: ds_write_b8 v0, v1 offset:8 85; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 86; GFX9-NEXT: ds_write_b8 v0, v2 87; GFX9-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 88; GFX9-NEXT: v_mov_b32_e32 v1, s5 89; GFX9-NEXT: s_lshr_b32 s0, s6, 8 90; GFX9-NEXT: ds_write_b8 v0, v1 offset:4 91; GFX9-NEXT: ds_write_b8_d16_hi v0, v1 offset:6 92; GFX9-NEXT: v_mov_b32_e32 v1, s0 93; GFX9-NEXT: s_lshr_b32 s0, s6, 24 94; GFX9-NEXT: ds_write_b8 v0, v1 offset:9 95; GFX9-NEXT: v_mov_b32_e32 v1, s0 96; GFX9-NEXT: s_lshr_b32 s0, s4, 8 97; GFX9-NEXT: ds_write_b8 v0, v1 offset:11 98; GFX9-NEXT: v_mov_b32_e32 v1, s0 99; GFX9-NEXT: s_lshr_b32 s0, s4, 24 100; 
GFX9-NEXT: ds_write_b8 v0, v1 offset:1 101; GFX9-NEXT: v_mov_b32_e32 v1, s0 102; GFX9-NEXT: s_lshr_b32 s0, s5, 8 103; GFX9-NEXT: ds_write_b8 v0, v1 offset:3 104; GFX9-NEXT: v_mov_b32_e32 v1, s0 105; GFX9-NEXT: s_lshr_b32 s0, s5, 24 106; GFX9-NEXT: ds_write_b8 v0, v1 offset:5 107; GFX9-NEXT: v_mov_b32_e32 v1, s0 108; GFX9-NEXT: ds_write_b8 v0, v1 offset:7 109; GFX9-NEXT: s_endpgm 110; 111; GFX7-LABEL: store_lds_v3i32_align1: 112; GFX7: ; %bb.0: 113; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 114; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 115; GFX7-NEXT: s_mov_b32 m0, -1 116; GFX7-NEXT: s_waitcnt lgkmcnt(0) 117; GFX7-NEXT: v_mov_b32_e32 v0, s4 118; GFX7-NEXT: v_mov_b32_e32 v1, s2 119; GFX7-NEXT: v_mov_b32_e32 v2, s0 120; GFX7-NEXT: ds_write_b8 v0, v1 offset:8 121; GFX7-NEXT: ds_write_b8 v0, v2 122; GFX7-NEXT: v_mov_b32_e32 v1, s1 123; GFX7-NEXT: s_lshr_b32 s3, s2, 8 124; GFX7-NEXT: ds_write_b8 v0, v1 offset:4 125; GFX7-NEXT: v_mov_b32_e32 v1, s3 126; GFX7-NEXT: s_lshr_b32 s3, s2, 24 127; GFX7-NEXT: ds_write_b8 v0, v1 offset:9 128; GFX7-NEXT: v_mov_b32_e32 v1, s3 129; GFX7-NEXT: s_lshr_b32 s2, s2, 16 130; GFX7-NEXT: ds_write_b8 v0, v1 offset:11 131; GFX7-NEXT: v_mov_b32_e32 v1, s2 132; GFX7-NEXT: s_lshr_b32 s2, s0, 8 133; GFX7-NEXT: ds_write_b8 v0, v1 offset:10 134; GFX7-NEXT: v_mov_b32_e32 v1, s2 135; GFX7-NEXT: s_lshr_b32 s2, s0, 24 136; GFX7-NEXT: ds_write_b8 v0, v1 offset:1 137; GFX7-NEXT: v_mov_b32_e32 v1, s2 138; GFX7-NEXT: s_lshr_b32 s0, s0, 16 139; GFX7-NEXT: ds_write_b8 v0, v1 offset:3 140; GFX7-NEXT: v_mov_b32_e32 v1, s0 141; GFX7-NEXT: s_lshr_b32 s0, s1, 8 142; GFX7-NEXT: ds_write_b8 v0, v1 offset:2 143; GFX7-NEXT: v_mov_b32_e32 v1, s0 144; GFX7-NEXT: s_lshr_b32 s0, s1, 24 145; GFX7-NEXT: ds_write_b8 v0, v1 offset:5 146; GFX7-NEXT: v_mov_b32_e32 v1, s0 147; GFX7-NEXT: s_lshr_b32 s0, s1, 16 148; GFX7-NEXT: ds_write_b8 v0, v1 offset:7 149; GFX7-NEXT: v_mov_b32_e32 v1, s0 150; GFX7-NEXT: ds_write_b8 v0, v1 offset:6 151; GFX7-NEXT: s_endpgm 152; 153; 
GFX6-LABEL: store_lds_v3i32_align1: 154; GFX6: ; %bb.0: 155; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 156; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 157; GFX6-NEXT: s_mov_b32 m0, -1 158; GFX6-NEXT: s_waitcnt lgkmcnt(0) 159; GFX6-NEXT: v_mov_b32_e32 v0, s4 160; GFX6-NEXT: v_mov_b32_e32 v1, s2 161; GFX6-NEXT: v_mov_b32_e32 v2, s0 162; GFX6-NEXT: ds_write_b8 v0, v1 offset:8 163; GFX6-NEXT: ds_write_b8 v0, v2 164; GFX6-NEXT: v_mov_b32_e32 v1, s1 165; GFX6-NEXT: s_lshr_b32 s3, s2, 8 166; GFX6-NEXT: ds_write_b8 v0, v1 offset:4 167; GFX6-NEXT: v_mov_b32_e32 v1, s3 168; GFX6-NEXT: s_lshr_b32 s3, s2, 24 169; GFX6-NEXT: ds_write_b8 v0, v1 offset:9 170; GFX6-NEXT: v_mov_b32_e32 v1, s3 171; GFX6-NEXT: s_lshr_b32 s2, s2, 16 172; GFX6-NEXT: ds_write_b8 v0, v1 offset:11 173; GFX6-NEXT: v_mov_b32_e32 v1, s2 174; GFX6-NEXT: s_lshr_b32 s2, s0, 8 175; GFX6-NEXT: ds_write_b8 v0, v1 offset:10 176; GFX6-NEXT: v_mov_b32_e32 v1, s2 177; GFX6-NEXT: s_lshr_b32 s2, s0, 24 178; GFX6-NEXT: ds_write_b8 v0, v1 offset:1 179; GFX6-NEXT: v_mov_b32_e32 v1, s2 180; GFX6-NEXT: s_lshr_b32 s0, s0, 16 181; GFX6-NEXT: ds_write_b8 v0, v1 offset:3 182; GFX6-NEXT: v_mov_b32_e32 v1, s0 183; GFX6-NEXT: s_lshr_b32 s0, s1, 8 184; GFX6-NEXT: ds_write_b8 v0, v1 offset:2 185; GFX6-NEXT: v_mov_b32_e32 v1, s0 186; GFX6-NEXT: s_lshr_b32 s0, s1, 24 187; GFX6-NEXT: ds_write_b8 v0, v1 offset:5 188; GFX6-NEXT: v_mov_b32_e32 v1, s0 189; GFX6-NEXT: s_lshr_b32 s0, s1, 16 190; GFX6-NEXT: ds_write_b8 v0, v1 offset:7 191; GFX6-NEXT: v_mov_b32_e32 v1, s0 192; GFX6-NEXT: ds_write_b8 v0, v1 offset:6 193; GFX6-NEXT: s_endpgm 194; 195; GFX10-LABEL: store_lds_v3i32_align1: 196; GFX10: ; %bb.0: 197; GFX10-NEXT: s_clause 0x1 198; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 199; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 200; GFX10-NEXT: s_waitcnt lgkmcnt(0) 201; GFX10-NEXT: v_mov_b32_e32 v0, s2 202; GFX10-NEXT: v_mov_b32_e32 v1, s6 203; GFX10-NEXT: v_mov_b32_e32 v2, s4 204; GFX10-NEXT: v_mov_b32_e32 v3, s5 205; GFX10-NEXT: 
s_lshr_b32 s0, s6, 8 206; GFX10-NEXT: s_lshr_b32 s1, s6, 24 207; GFX10-NEXT: s_lshr_b32 s2, s4, 8 208; GFX10-NEXT: s_lshr_b32 s3, s4, 24 209; GFX10-NEXT: s_lshr_b32 s4, s5, 8 210; GFX10-NEXT: s_lshr_b32 s5, s5, 24 211; GFX10-NEXT: v_mov_b32_e32 v4, s0 212; GFX10-NEXT: v_mov_b32_e32 v5, s1 213; GFX10-NEXT: v_mov_b32_e32 v6, s2 214; GFX10-NEXT: v_mov_b32_e32 v7, s3 215; GFX10-NEXT: v_mov_b32_e32 v8, s4 216; GFX10-NEXT: v_mov_b32_e32 v9, s5 217; GFX10-NEXT: ds_write_b8 v0, v1 offset:8 218; GFX10-NEXT: ds_write_b8_d16_hi v0, v1 offset:10 219; GFX10-NEXT: ds_write_b8 v0, v2 220; GFX10-NEXT: ds_write_b8_d16_hi v0, v2 offset:2 221; GFX10-NEXT: ds_write_b8 v0, v3 offset:4 222; GFX10-NEXT: ds_write_b8_d16_hi v0, v3 offset:6 223; GFX10-NEXT: ds_write_b8 v0, v4 offset:9 224; GFX10-NEXT: ds_write_b8 v0, v5 offset:11 225; GFX10-NEXT: ds_write_b8 v0, v6 offset:1 226; GFX10-NEXT: ds_write_b8 v0, v7 offset:3 227; GFX10-NEXT: ds_write_b8 v0, v8 offset:5 228; GFX10-NEXT: ds_write_b8 v0, v9 offset:7 229; GFX10-NEXT: s_endpgm 230; 231; GFX11-LABEL: store_lds_v3i32_align1: 232; GFX11: ; %bb.0: 233; GFX11-NEXT: s_clause 0x1 234; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 235; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 236; GFX11-NEXT: s_waitcnt lgkmcnt(0) 237; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 238; GFX11-NEXT: s_lshr_b32 s3, s2, 8 239; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 240; GFX11-NEXT: s_lshr_b32 s2, s2, 24 241; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 242; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2 243; GFX11-NEXT: s_lshr_b32 s4, s0, 8 244; GFX11-NEXT: s_lshr_b32 s0, s0, 24 245; GFX11-NEXT: s_lshr_b32 s5, s1, 8 246; GFX11-NEXT: s_lshr_b32 s1, s1, 24 247; GFX11-NEXT: v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s0 248; GFX11-NEXT: v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s1 249; GFX11-NEXT: ds_store_b8 v0, v1 offset:8 250; GFX11-NEXT: ds_store_b8 v0, v2 251; GFX11-NEXT: ds_store_b8 v0, v4 offset:9 252; GFX11-NEXT: 
ds_store_b8_d16_hi v0, v1 offset:10 253; GFX11-NEXT: ds_store_b8 v0, v5 offset:11 254; GFX11-NEXT: ds_store_b8_d16_hi v0, v2 offset:2 255; GFX11-NEXT: ds_store_b8 v0, v6 offset:1 256; GFX11-NEXT: ds_store_b8 v0, v3 offset:4 257; GFX11-NEXT: ds_store_b8 v0, v7 offset:3 258; GFX11-NEXT: ds_store_b8 v0, v8 offset:5 259; GFX11-NEXT: ds_store_b8_d16_hi v0, v3 offset:6 260; GFX11-NEXT: ds_store_b8 v0, v9 offset:7 261; GFX11-NEXT: s_endpgm 262 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1 263 ret void 264} 265 266define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { 267; GFX9-LABEL: store_lds_v3i32_align2: 268; GFX9: ; %bb.0: 269; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 270; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 271; GFX9-NEXT: s_waitcnt lgkmcnt(0) 272; GFX9-NEXT: v_mov_b32_e32 v0, s2 273; GFX9-NEXT: v_mov_b32_e32 v1, s6 274; GFX9-NEXT: v_mov_b32_e32 v2, s4 275; GFX9-NEXT: ds_write_b16 v0, v1 offset:8 276; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 277; GFX9-NEXT: ds_write_b16 v0, v2 278; GFX9-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 279; GFX9-NEXT: v_mov_b32_e32 v1, s5 280; GFX9-NEXT: ds_write_b16 v0, v1 offset:4 281; GFX9-NEXT: ds_write_b16_d16_hi v0, v1 offset:6 282; GFX9-NEXT: s_endpgm 283; 284; GFX7-LABEL: store_lds_v3i32_align2: 285; GFX7: ; %bb.0: 286; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 287; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 288; GFX7-NEXT: s_mov_b32 m0, -1 289; GFX7-NEXT: s_waitcnt lgkmcnt(0) 290; GFX7-NEXT: v_mov_b32_e32 v0, s4 291; GFX7-NEXT: v_mov_b32_e32 v1, s2 292; GFX7-NEXT: v_mov_b32_e32 v2, s0 293; GFX7-NEXT: ds_write_b16 v0, v1 offset:8 294; GFX7-NEXT: ds_write_b16 v0, v2 295; GFX7-NEXT: v_mov_b32_e32 v1, s1 296; GFX7-NEXT: s_lshr_b32 s2, s2, 16 297; GFX7-NEXT: ds_write_b16 v0, v1 offset:4 298; GFX7-NEXT: v_mov_b32_e32 v1, s2 299; GFX7-NEXT: s_lshr_b32 s0, s0, 16 300; GFX7-NEXT: ds_write_b16 v0, v1 offset:10 301; GFX7-NEXT: v_mov_b32_e32 v1, s0 302; GFX7-NEXT: s_lshr_b32 
s0, s1, 16 303; GFX7-NEXT: ds_write_b16 v0, v1 offset:2 304; GFX7-NEXT: v_mov_b32_e32 v1, s0 305; GFX7-NEXT: ds_write_b16 v0, v1 offset:6 306; GFX7-NEXT: s_endpgm 307; 308; GFX6-LABEL: store_lds_v3i32_align2: 309; GFX6: ; %bb.0: 310; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 311; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 312; GFX6-NEXT: s_mov_b32 m0, -1 313; GFX6-NEXT: s_waitcnt lgkmcnt(0) 314; GFX6-NEXT: v_mov_b32_e32 v0, s4 315; GFX6-NEXT: v_mov_b32_e32 v1, s2 316; GFX6-NEXT: v_mov_b32_e32 v2, s0 317; GFX6-NEXT: ds_write_b16 v0, v1 offset:8 318; GFX6-NEXT: ds_write_b16 v0, v2 319; GFX6-NEXT: v_mov_b32_e32 v1, s1 320; GFX6-NEXT: s_lshr_b32 s2, s2, 16 321; GFX6-NEXT: ds_write_b16 v0, v1 offset:4 322; GFX6-NEXT: v_mov_b32_e32 v1, s2 323; GFX6-NEXT: s_lshr_b32 s0, s0, 16 324; GFX6-NEXT: ds_write_b16 v0, v1 offset:10 325; GFX6-NEXT: v_mov_b32_e32 v1, s0 326; GFX6-NEXT: s_lshr_b32 s0, s1, 16 327; GFX6-NEXT: ds_write_b16 v0, v1 offset:2 328; GFX6-NEXT: v_mov_b32_e32 v1, s0 329; GFX6-NEXT: ds_write_b16 v0, v1 offset:6 330; GFX6-NEXT: s_endpgm 331; 332; GFX10-LABEL: store_lds_v3i32_align2: 333; GFX10: ; %bb.0: 334; GFX10-NEXT: s_clause 0x1 335; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 336; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 337; GFX10-NEXT: s_waitcnt lgkmcnt(0) 338; GFX10-NEXT: v_mov_b32_e32 v0, s2 339; GFX10-NEXT: v_mov_b32_e32 v1, s6 340; GFX10-NEXT: v_mov_b32_e32 v2, s4 341; GFX10-NEXT: v_mov_b32_e32 v3, s5 342; GFX10-NEXT: ds_write_b16 v0, v1 offset:8 343; GFX10-NEXT: ds_write_b16_d16_hi v0, v1 offset:10 344; GFX10-NEXT: ds_write_b16 v0, v2 345; GFX10-NEXT: ds_write_b16_d16_hi v0, v2 offset:2 346; GFX10-NEXT: ds_write_b16 v0, v3 offset:4 347; GFX10-NEXT: ds_write_b16_d16_hi v0, v3 offset:6 348; GFX10-NEXT: s_endpgm 349; 350; GFX11-LABEL: store_lds_v3i32_align2: 351; GFX11: ; %bb.0: 352; GFX11-NEXT: s_clause 0x1 353; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 354; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 355; GFX11-NEXT: s_waitcnt lgkmcnt(0) 356; 
GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2 357; GFX11-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 358; GFX11-NEXT: ds_store_b16_d16_hi v0, v1 offset:10 359; GFX11-NEXT: ds_store_b16 v0, v2 360; GFX11-NEXT: ds_store_b16_d16_hi v0, v2 offset:2 361; GFX11-NEXT: ds_store_b16 v0, v3 offset:4 362; GFX11-NEXT: ds_store_b16 v0, v1 offset:8 363; GFX11-NEXT: ds_store_b16_d16_hi v0, v3 offset:6 364; GFX11-NEXT: s_endpgm 365 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2 366 ret void 367} 368 369define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { 370; GFX9-LABEL: store_lds_v3i32_align4: 371; GFX9: ; %bb.0: 372; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 373; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 374; GFX9-NEXT: s_waitcnt lgkmcnt(0) 375; GFX9-NEXT: v_mov_b32_e32 v0, s2 376; GFX9-NEXT: v_mov_b32_e32 v1, s4 377; GFX9-NEXT: v_mov_b32_e32 v2, s5 378; GFX9-NEXT: v_mov_b32_e32 v3, s6 379; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 380; GFX9-NEXT: ds_write_b32 v0, v3 offset:8 381; GFX9-NEXT: s_endpgm 382; 383; GFX7-LABEL: store_lds_v3i32_align4: 384; GFX7: ; %bb.0: 385; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 386; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 387; GFX7-NEXT: s_mov_b32 m0, -1 388; GFX7-NEXT: s_waitcnt lgkmcnt(0) 389; GFX7-NEXT: v_mov_b32_e32 v0, s4 390; GFX7-NEXT: v_mov_b32_e32 v1, s0 391; GFX7-NEXT: v_mov_b32_e32 v2, s1 392; GFX7-NEXT: ds_write2_b32 v0, v1, v2 offset1:1 393; GFX7-NEXT: v_mov_b32_e32 v1, s2 394; GFX7-NEXT: ds_write_b32 v0, v1 offset:8 395; GFX7-NEXT: s_endpgm 396; 397; GFX6-LABEL: store_lds_v3i32_align4: 398; GFX6: ; %bb.0: 399; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 400; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 401; GFX6-NEXT: s_mov_b32 m0, -1 402; GFX6-NEXT: s_waitcnt lgkmcnt(0) 403; GFX6-NEXT: v_mov_b32_e32 v0, s4 404; GFX6-NEXT: v_mov_b32_e32 v1, s2 405; GFX6-NEXT: v_mov_b32_e32 v2, s0 406; GFX6-NEXT: ds_write_b32 v0, v1 offset:8 407; GFX6-NEXT: 
v_mov_b32_e32 v1, s1 408; GFX6-NEXT: ds_write2_b32 v0, v2, v1 offset1:1 409; GFX6-NEXT: s_endpgm 410; 411; GFX10-LABEL: store_lds_v3i32_align4: 412; GFX10: ; %bb.0: 413; GFX10-NEXT: s_clause 0x1 414; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 415; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 416; GFX10-NEXT: s_waitcnt lgkmcnt(0) 417; GFX10-NEXT: v_mov_b32_e32 v0, s2 418; GFX10-NEXT: v_mov_b32_e32 v1, s6 419; GFX10-NEXT: v_mov_b32_e32 v2, s4 420; GFX10-NEXT: v_mov_b32_e32 v3, s5 421; GFX10-NEXT: ds_write_b32 v0, v1 offset:8 422; GFX10-NEXT: ds_write2_b32 v0, v2, v3 offset1:1 423; GFX10-NEXT: s_endpgm 424; 425; GFX11-LABEL: store_lds_v3i32_align4: 426; GFX11: ; %bb.0: 427; GFX11-NEXT: s_clause 0x1 428; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 429; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 430; GFX11-NEXT: s_waitcnt lgkmcnt(0) 431; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0 432; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2 433; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset1:1 434; GFX11-NEXT: ds_store_b32 v0, v3 offset:8 435; GFX11-NEXT: s_endpgm 436 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4 437 ret void 438} 439 440define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { 441; GFX9-LABEL: store_lds_v3i32_align8: 442; GFX9: ; %bb.0: 443; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 444; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 445; GFX9-NEXT: s_waitcnt lgkmcnt(0) 446; GFX9-NEXT: v_mov_b32_e32 v2, s2 447; GFX9-NEXT: v_mov_b32_e32 v3, s6 448; GFX9-NEXT: v_mov_b32_e32 v0, s4 449; GFX9-NEXT: v_mov_b32_e32 v1, s5 450; GFX9-NEXT: ds_write_b32 v2, v3 offset:8 451; GFX9-NEXT: ds_write_b64 v2, v[0:1] 452; GFX9-NEXT: s_endpgm 453; 454; GFX7-LABEL: store_lds_v3i32_align8: 455; GFX7: ; %bb.0: 456; GFX7-NEXT: s_load_dword s4, s[0:1], 0x0 457; GFX7-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 458; GFX7-NEXT: s_mov_b32 m0, -1 459; GFX7-NEXT: s_waitcnt lgkmcnt(0) 460; GFX7-NEXT: v_mov_b32_e32 v2, s4 
461; GFX7-NEXT: v_mov_b32_e32 v1, s2 462; GFX7-NEXT: v_mov_b32_e32 v0, s0 463; GFX7-NEXT: ds_write_b32 v2, v1 offset:8 464; GFX7-NEXT: v_mov_b32_e32 v1, s1 465; GFX7-NEXT: ds_write_b64 v2, v[0:1] 466; GFX7-NEXT: s_endpgm 467; 468; GFX6-LABEL: store_lds_v3i32_align8: 469; GFX6: ; %bb.0: 470; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 471; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 472; GFX6-NEXT: s_mov_b32 m0, -1 473; GFX6-NEXT: s_waitcnt lgkmcnt(0) 474; GFX6-NEXT: v_mov_b32_e32 v2, s4 475; GFX6-NEXT: v_mov_b32_e32 v1, s2 476; GFX6-NEXT: v_mov_b32_e32 v0, s0 477; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 478; GFX6-NEXT: v_mov_b32_e32 v1, s1 479; GFX6-NEXT: ds_write_b64 v2, v[0:1] 480; GFX6-NEXT: s_endpgm 481; 482; GFX10-LABEL: store_lds_v3i32_align8: 483; GFX10: ; %bb.0: 484; GFX10-NEXT: s_clause 0x1 485; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 486; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 487; GFX10-NEXT: s_waitcnt lgkmcnt(0) 488; GFX10-NEXT: v_mov_b32_e32 v2, s2 489; GFX10-NEXT: v_mov_b32_e32 v3, s6 490; GFX10-NEXT: v_mov_b32_e32 v0, s4 491; GFX10-NEXT: v_mov_b32_e32 v1, s5 492; GFX10-NEXT: ds_write_b32 v2, v3 offset:8 493; GFX10-NEXT: ds_write_b64 v2, v[0:1] 494; GFX10-NEXT: s_endpgm 495; 496; GFX11-LABEL: store_lds_v3i32_align8: 497; GFX11: ; %bb.0: 498; GFX11-NEXT: s_clause 0x1 499; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 500; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x10 501; GFX11-NEXT: s_waitcnt lgkmcnt(0) 502; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s2 503; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 504; GFX11-NEXT: ds_store_b32 v2, v3 offset:8 505; GFX11-NEXT: ds_store_b64 v2, v[0:1] 506; GFX11-NEXT: s_endpgm 507 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8 508 ret void 509} 510 511define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) { 512; GFX9-LABEL: store_lds_v3i32_align16: 513; GFX9: ; %bb.0: 514; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 515; GFX9-NEXT: 
s_load_dword s2, s[0:1], 0x0 516; GFX9-NEXT: s_waitcnt lgkmcnt(0) 517; GFX9-NEXT: v_mov_b32_e32 v0, s4 518; GFX9-NEXT: v_mov_b32_e32 v1, s5 519; GFX9-NEXT: v_mov_b32_e32 v2, s6 520; GFX9-NEXT: v_mov_b32_e32 v3, s2 521; GFX9-NEXT: ds_write_b96 v3, v[0:2] 522; GFX9-NEXT: s_endpgm 523; 524; GFX7-LABEL: store_lds_v3i32_align16: 525; GFX7: ; %bb.0: 526; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x4 527; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 528; GFX7-NEXT: s_mov_b32 m0, -1 529; GFX7-NEXT: s_waitcnt lgkmcnt(0) 530; GFX7-NEXT: v_mov_b32_e32 v0, s4 531; GFX7-NEXT: v_mov_b32_e32 v1, s5 532; GFX7-NEXT: v_mov_b32_e32 v2, s6 533; GFX7-NEXT: v_mov_b32_e32 v3, s0 534; GFX7-NEXT: ds_write_b96 v3, v[0:2] 535; GFX7-NEXT: s_endpgm 536; 537; GFX6-LABEL: store_lds_v3i32_align16: 538; GFX6: ; %bb.0: 539; GFX6-NEXT: s_load_dword s4, s[0:1], 0x0 540; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x4 541; GFX6-NEXT: s_mov_b32 m0, -1 542; GFX6-NEXT: s_waitcnt lgkmcnt(0) 543; GFX6-NEXT: v_mov_b32_e32 v2, s4 544; GFX6-NEXT: v_mov_b32_e32 v1, s2 545; GFX6-NEXT: v_mov_b32_e32 v0, s0 546; GFX6-NEXT: ds_write_b32 v2, v1 offset:8 547; GFX6-NEXT: v_mov_b32_e32 v1, s1 548; GFX6-NEXT: ds_write_b64 v2, v[0:1] 549; GFX6-NEXT: s_endpgm 550; 551; GFX10-LABEL: store_lds_v3i32_align16: 552; GFX10: ; %bb.0: 553; GFX10-NEXT: s_clause 0x1 554; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10 555; GFX10-NEXT: s_load_dword s2, s[0:1], 0x0 556; GFX10-NEXT: s_waitcnt lgkmcnt(0) 557; GFX10-NEXT: v_mov_b32_e32 v0, s4 558; GFX10-NEXT: v_mov_b32_e32 v1, s5 559; GFX10-NEXT: v_mov_b32_e32 v2, s6 560; GFX10-NEXT: v_mov_b32_e32 v3, s2 561; GFX10-NEXT: ds_write_b96 v3, v[0:2] 562; GFX10-NEXT: s_endpgm 563; 564; GFX11-LABEL: store_lds_v3i32_align16: 565; GFX11: ; %bb.0: 566; GFX11-NEXT: s_clause 0x1 567; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x10 568; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 569; GFX11-NEXT: s_waitcnt lgkmcnt(0) 570; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 571; GFX11-NEXT: v_dual_mov_b32 
v2, s6 :: v_dual_mov_b32 v3, s0 572; GFX11-NEXT: ds_store_b96 v3, v[0:2] 573; GFX11-NEXT: s_endpgm 574 store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16 575 ret void 576} 577