1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s 3; RUN: llc -march=amdgcn -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s 4; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s 5; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s 6 7define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { 8; GFX9-LABEL: store_load_sindex_kernel: 9; GFX9: ; %bb.0: ; %bb 10; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 11; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 12; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 13; GFX9-NEXT: v_mov_b32_e32 v0, 15 14; GFX9-NEXT: s_waitcnt lgkmcnt(0) 15; GFX9-NEXT: s_lshl_b32 s1, s0, 2 16; GFX9-NEXT: s_and_b32 s0, s0, 15 17; GFX9-NEXT: s_add_i32 s1, s1, 4 18; GFX9-NEXT: s_lshl_b32 s0, s0, 2 19; GFX9-NEXT: scratch_store_dword off, v0, s1 20; GFX9-NEXT: s_waitcnt vmcnt(0) 21; GFX9-NEXT: s_add_i32 s0, s0, 4 22; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 23; GFX9-NEXT: s_waitcnt vmcnt(0) 24; GFX9-NEXT: s_endpgm 25; 26; GFX10-LABEL: store_load_sindex_kernel: 27; GFX10: ; %bb.0: ; %bb 28; GFX10-NEXT: s_add_u32 s2, s2, s5 29; GFX10-NEXT: s_addc_u32 s3, s3, 0 30; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 31; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 32; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 33; GFX10-NEXT: v_mov_b32_e32 v0, 15 34; GFX10-NEXT: s_waitcnt lgkmcnt(0) 35; GFX10-NEXT: s_and_b32 s1, s0, 15 36; GFX10-NEXT: s_lshl_b32 s0, s0, 2 37; GFX10-NEXT: s_lshl_b32 s1, s1, 2 38; GFX10-NEXT: s_add_i32 s0, s0, 4 39; GFX10-NEXT: s_add_i32 s1, s1, 4 40; GFX10-NEXT: 
scratch_store_dword off, v0, s0 41; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 42; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 43; GFX10-NEXT: s_waitcnt vmcnt(0) 44; GFX10-NEXT: s_endpgm 45; 46; GFX940-LABEL: store_load_sindex_kernel: 47; GFX940: ; %bb.0: ; %bb 48; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 49; GFX940-NEXT: v_mov_b32_e32 v0, 15 50; GFX940-NEXT: s_waitcnt lgkmcnt(0) 51; GFX940-NEXT: s_lshl_b32 s1, s0, 2 52; GFX940-NEXT: s_and_b32 s0, s0, 15 53; GFX940-NEXT: v_mov_b32_e32 v1, s1 54; GFX940-NEXT: s_lshl_b32 s0, s0, 2 55; GFX940-NEXT: scratch_store_dword v1, v0, off offset:4 sc0 sc1 56; GFX940-NEXT: s_waitcnt vmcnt(0) 57; GFX940-NEXT: v_mov_b32_e32 v0, s0 58; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1 59; GFX940-NEXT: s_waitcnt vmcnt(0) 60; GFX940-NEXT: s_endpgm 61; 62; GFX11-LABEL: store_load_sindex_kernel: 63; GFX11: ; %bb.0: ; %bb 64; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 65; GFX11-NEXT: v_mov_b32_e32 v1, 15 66; GFX11-NEXT: s_waitcnt lgkmcnt(0) 67; GFX11-NEXT: s_lshl_b32 s1, s0, 2 68; GFX11-NEXT: s_and_b32 s0, s0, 15 69; GFX11-NEXT: v_mov_b32_e32 v0, s1 70; GFX11-NEXT: s_lshl_b32 s0, s0, 2 71; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 72; GFX11-NEXT: v_mov_b32_e32 v2, s0 73; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc 74; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 75; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:4 glc dlc 76; GFX11-NEXT: s_waitcnt vmcnt(0) 77; GFX11-NEXT: s_endpgm 78bb: 79 %i = alloca [32 x float], align 4, addrspace(5) 80 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 81 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 82 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 83 store volatile i32 15, i32 addrspace(5)* %i8, align 4 84 %i9 = and i32 %idx, 15 85 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 86 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 87 %i12 = load volatile 
i32, i32 addrspace(5)* %i11, align 4 88 ret void 89} 90 91define amdgpu_kernel void @store_load_vindex_kernel() { 92; GFX9-LABEL: store_load_vindex_kernel: 93; GFX9: ; %bb.0: ; %bb 94; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 95; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 96; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 97; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 98; GFX9-NEXT: v_add_u32_e32 v1, 4, v1 99; GFX9-NEXT: v_mov_b32_e32 v2, 15 100; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 101; GFX9-NEXT: scratch_store_dword v1, v2, off 102; GFX9-NEXT: s_waitcnt vmcnt(0) 103; GFX9-NEXT: v_add_u32_e32 v0, 4, v0 104; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 105; GFX9-NEXT: s_waitcnt vmcnt(0) 106; GFX9-NEXT: s_endpgm 107; 108; GFX10-LABEL: store_load_vindex_kernel: 109; GFX10: ; %bb.0: ; %bb 110; GFX10-NEXT: s_add_u32 s0, s0, s3 111; GFX10-NEXT: s_addc_u32 s1, s1, 0 112; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 113; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 114; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 115; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 116; GFX10-NEXT: v_mov_b32_e32 v2, 15 117; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 118; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0 119; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v1 120; GFX10-NEXT: scratch_store_dword v0, v2, off 121; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 122; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc 123; GFX10-NEXT: s_waitcnt vmcnt(0) 124; GFX10-NEXT: s_endpgm 125; 126; GFX940-LABEL: store_load_vindex_kernel: 127; GFX940: ; %bb.0: ; %bb 128; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 129; GFX940-NEXT: v_mov_b32_e32 v2, 15 130; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 131; GFX940-NEXT: scratch_store_dword v1, v2, off offset:4 sc0 sc1 132; GFX940-NEXT: s_waitcnt vmcnt(0) 133; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 134; GFX940-NEXT: scratch_load_dword v0, v0, off offset:128 sc0 sc1 135; GFX940-NEXT: s_waitcnt vmcnt(0) 136; GFX940-NEXT: s_endpgm 137; 138; GFX11-LABEL: 
store_load_vindex_kernel: 139; GFX11: ; %bb.0: ; %bb 140; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 141; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 142; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 143; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v1 144; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc 145; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 146; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:128 glc dlc 147; GFX11-NEXT: s_waitcnt vmcnt(0) 148; GFX11-NEXT: s_endpgm 149bb: 150 %i = alloca [32 x float], align 4, addrspace(5) 151 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 152 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 153 %i3 = zext i32 %i2 to i64 154 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 155 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 156 store volatile i32 15, i32 addrspace(5)* %i8, align 4 157 %i9 = sub nsw i32 31, %i2 158 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 159 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 160 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 161 ret void 162} 163 164define void @store_load_vindex_foo(i32 %idx) { 165; GFX9-LABEL: store_load_vindex_foo: 166; GFX9: ; %bb.0: ; %bb 167; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 168; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 169; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 170; GFX9-NEXT: v_add_u32_e32 v1, s32, v1 171; GFX9-NEXT: v_mov_b32_e32 v2, 15 172; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 173; GFX9-NEXT: scratch_store_dword v1, v2, off 174; GFX9-NEXT: s_waitcnt vmcnt(0) 175; GFX9-NEXT: v_add_u32_e32 v0, s32, v0 176; GFX9-NEXT: scratch_load_dword v0, v0, off glc 177; GFX9-NEXT: s_waitcnt vmcnt(0) 178; GFX9-NEXT: s_setpc_b64 s[30:31] 179; 180; GFX10-LABEL: store_load_vindex_foo: 181; GFX10: ; %bb.0: ; %bb 182; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 183; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 184; 
GFX10-NEXT: v_and_b32_e32 v1, 15, v0 185; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 186; GFX10-NEXT: v_mov_b32_e32 v2, 15 187; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 188; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0 189; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1 190; GFX10-NEXT: scratch_store_dword v0, v2, off 191; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 192; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 193; GFX10-NEXT: s_waitcnt vmcnt(0) 194; GFX10-NEXT: s_setpc_b64 s[30:31] 195; 196; GFX940-LABEL: store_load_vindex_foo: 197; GFX940: ; %bb.0: ; %bb 198; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 199; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 200; GFX940-NEXT: v_mov_b32_e32 v2, 15 201; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 202; GFX940-NEXT: scratch_store_dword v1, v2, s32 sc0 sc1 203; GFX940-NEXT: s_waitcnt vmcnt(0) 204; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 205; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1 206; GFX940-NEXT: s_waitcnt vmcnt(0) 207; GFX940-NEXT: s_setpc_b64 s[30:31] 208; 209; GFX11-LABEL: store_load_vindex_foo: 210; GFX11: ; %bb.0: ; %bb 211; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 212; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 213; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 214; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 215; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) 216; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 217; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc 218; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 219; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc 220; GFX11-NEXT: s_waitcnt vmcnt(0) 221; GFX11-NEXT: s_setpc_b64 s[30:31] 222bb: 223 %i = alloca [32 x float], align 4, addrspace(5) 224 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 225 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 226 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 227 store volatile i32 15, i32 addrspace(5)* %i8, align 4 228 %i9 = and i32 %idx, 15 229 %i10 = 
getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 230 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 231 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 232 ret void 233} 234 235define void @private_ptr_foo(float addrspace(5)* nocapture %arg) { 236; GFX9-LABEL: private_ptr_foo: 237; GFX9: ; %bb.0: 238; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 239; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 240; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 241; GFX9-NEXT: s_waitcnt vmcnt(0) 242; GFX9-NEXT: s_setpc_b64 s[30:31] 243; 244; GFX10-LABEL: private_ptr_foo: 245; GFX10: ; %bb.0: 246; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 247; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 248; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 249; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 250; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 251; GFX10-NEXT: s_setpc_b64 s[30:31] 252; 253; GFX940-LABEL: private_ptr_foo: 254; GFX940: ; %bb.0: 255; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 256; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000 257; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 258; GFX940-NEXT: s_waitcnt vmcnt(0) 259; GFX940-NEXT: s_setpc_b64 s[30:31] 260; 261; GFX11-LABEL: private_ptr_foo: 262; GFX11: ; %bb.0: 263; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 264; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 265; GFX11-NEXT: v_mov_b32_e32 v1, 0x41200000 266; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 267; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 268; GFX11-NEXT: s_setpc_b64 s[30:31] 269 %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 270 store float 1.000000e+01, float addrspace(5)* %gep, align 4 271 ret void 272} 273 274define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { 275; GFX9-LABEL: store_load_sindex_small_offset_kernel: 276; GFX9: ; %bb.0: ; %bb 277; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 278; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 
279; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 280; GFX9-NEXT: s_mov_b32 vcc_hi, 0 281; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 282; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 283; GFX9-NEXT: s_lshl_b32 s1, s0, 2 284; GFX9-NEXT: s_and_b32 s0, s0, 15 285; GFX9-NEXT: v_mov_b32_e32 v0, 15 286; GFX9-NEXT: s_addk_i32 s1, 0x104 287; GFX9-NEXT: s_lshl_b32 s0, s0, 2 288; GFX9-NEXT: scratch_store_dword off, v0, s1 289; GFX9-NEXT: s_waitcnt vmcnt(0) 290; GFX9-NEXT: s_addk_i32 s0, 0x104 291; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 292; GFX9-NEXT: s_waitcnt vmcnt(0) 293; GFX9-NEXT: s_endpgm 294; 295; GFX10-LABEL: store_load_sindex_small_offset_kernel: 296; GFX10: ; %bb.0: ; %bb 297; GFX10-NEXT: s_add_u32 s2, s2, s5 298; GFX10-NEXT: s_addc_u32 s3, s3, 0 299; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 300; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 301; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 302; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 303; GFX10-NEXT: s_waitcnt vmcnt(0) 304; GFX10-NEXT: v_mov_b32_e32 v0, 15 305; GFX10-NEXT: s_waitcnt lgkmcnt(0) 306; GFX10-NEXT: s_and_b32 s1, s0, 15 307; GFX10-NEXT: s_lshl_b32 s0, s0, 2 308; GFX10-NEXT: s_lshl_b32 s1, s1, 2 309; GFX10-NEXT: s_addk_i32 s0, 0x104 310; GFX10-NEXT: s_addk_i32 s1, 0x104 311; GFX10-NEXT: scratch_store_dword off, v0, s0 312; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 313; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 314; GFX10-NEXT: s_waitcnt vmcnt(0) 315; GFX10-NEXT: s_endpgm 316; 317; GFX940-LABEL: store_load_sindex_small_offset_kernel: 318; GFX940: ; %bb.0: ; %bb 319; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 320; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 321; GFX940-NEXT: s_waitcnt vmcnt(0) 322; GFX940-NEXT: v_mov_b32_e32 v0, 15 323; GFX940-NEXT: s_waitcnt lgkmcnt(0) 324; GFX940-NEXT: s_lshl_b32 s1, s0, 2 325; GFX940-NEXT: s_and_b32 s0, s0, 15 326; GFX940-NEXT: v_mov_b32_e32 v1, s1 327; GFX940-NEXT: s_lshl_b32 s0, s0, 2 
328; GFX940-NEXT: scratch_store_dword v1, v0, off offset:260 sc0 sc1 329; GFX940-NEXT: s_waitcnt vmcnt(0) 330; GFX940-NEXT: v_mov_b32_e32 v0, s0 331; GFX940-NEXT: scratch_load_dword v0, v0, off offset:260 sc0 sc1 332; GFX940-NEXT: s_waitcnt vmcnt(0) 333; GFX940-NEXT: s_endpgm 334; 335; GFX11-LABEL: store_load_sindex_small_offset_kernel: 336; GFX11: ; %bb.0: ; %bb 337; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 338; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 339; GFX11-NEXT: s_waitcnt vmcnt(0) 340; GFX11-NEXT: v_mov_b32_e32 v1, 15 341; GFX11-NEXT: s_waitcnt lgkmcnt(0) 342; GFX11-NEXT: s_lshl_b32 s1, s0, 2 343; GFX11-NEXT: s_and_b32 s0, s0, 15 344; GFX11-NEXT: v_mov_b32_e32 v0, s1 345; GFX11-NEXT: s_lshl_b32 s0, s0, 2 346; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 347; GFX11-NEXT: v_mov_b32_e32 v2, s0 348; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:260 dlc 349; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 350; GFX11-NEXT: scratch_load_b32 v0, v2, off offset:260 glc dlc 351; GFX11-NEXT: s_waitcnt vmcnt(0) 352; GFX11-NEXT: s_endpgm 353bb: 354 %padding = alloca [64 x i32], align 4, addrspace(5) 355 %i = alloca [32 x float], align 4, addrspace(5) 356 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 357 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 358 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 359 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 360 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 361 store volatile i32 15, i32 addrspace(5)* %i8, align 4 362 %i9 = and i32 %idx, 15 363 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 364 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 365 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 366 ret void 367} 368 369define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { 370; GFX9-LABEL: 
store_load_vindex_small_offset_kernel: 371; GFX9: ; %bb.0: ; %bb 372; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 373; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 374; GFX9-NEXT: s_mov_b32 vcc_hi, 0 375; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 glc 376; GFX9-NEXT: s_waitcnt vmcnt(0) 377; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 378; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0 379; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1 380; GFX9-NEXT: v_mov_b32_e32 v2, 15 381; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 382; GFX9-NEXT: scratch_store_dword v1, v2, off 383; GFX9-NEXT: s_waitcnt vmcnt(0) 384; GFX9-NEXT: v_add_u32_e32 v0, 0x104, v0 385; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 glc 386; GFX9-NEXT: s_waitcnt vmcnt(0) 387; GFX9-NEXT: s_endpgm 388; 389; GFX10-LABEL: store_load_vindex_small_offset_kernel: 390; GFX10: ; %bb.0: ; %bb 391; GFX10-NEXT: s_add_u32 s0, s0, s3 392; GFX10-NEXT: s_addc_u32 s1, s1, 0 393; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 394; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 395; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0 396; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 397; GFX10-NEXT: v_mov_b32_e32 v2, 15 398; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc 399; GFX10-NEXT: s_waitcnt vmcnt(0) 400; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 401; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0 402; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x104, v1 403; GFX10-NEXT: scratch_store_dword v0, v2, off 404; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 405; GFX10-NEXT: scratch_load_dword v0, v1, off offset:124 glc dlc 406; GFX10-NEXT: s_waitcnt vmcnt(0) 407; GFX10-NEXT: s_endpgm 408; 409; GFX940-LABEL: store_load_vindex_small_offset_kernel: 410; GFX940: ; %bb.0: ; %bb 411; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1 412; GFX940-NEXT: s_waitcnt vmcnt(0) 413; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 414; GFX940-NEXT: v_mov_b32_e32 v2, 15 415; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0 416; GFX940-NEXT: scratch_store_dword v1, 
v2, off offset:260 sc0 sc1 417; GFX940-NEXT: s_waitcnt vmcnt(0) 418; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 419; GFX940-NEXT: scratch_load_dword v0, v0, off offset:384 sc0 sc1 420; GFX940-NEXT: s_waitcnt vmcnt(0) 421; GFX940-NEXT: s_endpgm 422; 423; GFX11-LABEL: store_load_vindex_small_offset_kernel: 424; GFX11: ; %bb.0: ; %bb 425; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0 426; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 427; GFX11-NEXT: v_mov_b32_e32 v2, 15 428; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 429; GFX11-NEXT: s_waitcnt vmcnt(0) 430; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 431; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc 432; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 433; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:384 glc dlc 434; GFX11-NEXT: s_waitcnt vmcnt(0) 435; GFX11-NEXT: s_endpgm 436bb: 437 %padding = alloca [64 x i32], align 4, addrspace(5) 438 %i = alloca [32 x float], align 4, addrspace(5) 439 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 440 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 441 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 442 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() 443 %i3 = zext i32 %i2 to i64 444 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 445 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 446 store volatile i32 15, i32 addrspace(5)* %i8, align 4 447 %i9 = sub nsw i32 31, %i2 448 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 449 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 450 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 451 ret void 452} 453 454define void @store_load_vindex_small_offset_foo(i32 %idx) { 455; GFX9-LABEL: store_load_vindex_small_offset_foo: 456; GFX9: ; %bb.0: ; %bb 457; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 458; GFX9-NEXT: scratch_load_dword v1, 
off, s32 glc 459; GFX9-NEXT: s_waitcnt vmcnt(0) 460; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 461; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 462; GFX9-NEXT: v_and_b32_e32 v0, 15, v0 463; GFX9-NEXT: v_add_u32_e32 v1, vcc_hi, v1 464; GFX9-NEXT: v_mov_b32_e32 v2, 15 465; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 466; GFX9-NEXT: s_add_i32 vcc_hi, s32, 0x100 467; GFX9-NEXT: scratch_store_dword v1, v2, off 468; GFX9-NEXT: s_waitcnt vmcnt(0) 469; GFX9-NEXT: v_add_u32_e32 v0, vcc_hi, v0 470; GFX9-NEXT: scratch_load_dword v0, v0, off glc 471; GFX9-NEXT: s_waitcnt vmcnt(0) 472; GFX9-NEXT: s_setpc_b64 s[30:31] 473; 474; GFX10-LABEL: store_load_vindex_small_offset_foo: 475; GFX10: ; %bb.0: ; %bb 476; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 477; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 478; GFX10-NEXT: v_and_b32_e32 v1, 15, v0 479; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 480; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 481; GFX10-NEXT: v_mov_b32_e32 v2, 15 482; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc 483; GFX10-NEXT: s_waitcnt vmcnt(0) 484; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1 485; GFX10-NEXT: v_add_nc_u32_e32 v0, vcc_lo, v0 486; GFX10-NEXT: s_add_i32 vcc_lo, s32, 0x100 487; GFX10-NEXT: v_add_nc_u32_e32 v1, vcc_lo, v1 488; GFX10-NEXT: scratch_store_dword v0, v2, off 489; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 490; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc 491; GFX10-NEXT: s_waitcnt vmcnt(0) 492; GFX10-NEXT: s_setpc_b64 s[30:31] 493; 494; GFX940-LABEL: store_load_vindex_small_offset_foo: 495; GFX940: ; %bb.0: ; %bb 496; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 497; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1 498; GFX940-NEXT: s_waitcnt vmcnt(0) 499; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0 500; GFX940-NEXT: v_mov_b32_e32 v2, 15 501; GFX940-NEXT: v_and_b32_e32 v0, 15, v0 502; GFX940-NEXT: scratch_store_dword v1, v2, s32 offset:256 sc0 sc1 503; GFX940-NEXT: s_waitcnt vmcnt(0) 504; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0 505; GFX940-NEXT: 
scratch_load_dword v0, v0, s32 offset:256 sc0 sc1 506; GFX940-NEXT: s_waitcnt vmcnt(0) 507; GFX940-NEXT: s_setpc_b64 s[30:31] 508; 509; GFX11-LABEL: store_load_vindex_small_offset_foo: 510; GFX11: ; %bb.0: ; %bb 511; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 512; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 513; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 514; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 515; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc 516; GFX11-NEXT: s_waitcnt vmcnt(0) 517; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 518; GFX11-NEXT: scratch_store_b32 v0, v2, s32 offset:256 dlc 519; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 520; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc 521; GFX11-NEXT: s_waitcnt vmcnt(0) 522; GFX11-NEXT: s_setpc_b64 s[30:31] 523bb: 524 %padding = alloca [64 x i32], align 4, addrspace(5) 525 %i = alloca [32 x float], align 4, addrspace(5) 526 %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef 527 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 528 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 529 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 530 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 531 store volatile i32 15, i32 addrspace(5)* %i8, align 4 532 %i9 = and i32 %idx, 15 533 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 534 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 535 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 536 ret void 537} 538 539define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { 540; GFX9-LABEL: store_load_sindex_large_offset_kernel: 541; GFX9: ; %bb.0: ; %bb 542; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 543; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 544; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 545; GFX9-NEXT: s_mov_b32 vcc_hi, 0 546; 
GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 glc 547; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 548; GFX9-NEXT: s_lshl_b32 s1, s0, 2 549; GFX9-NEXT: s_and_b32 s0, s0, 15 550; GFX9-NEXT: v_mov_b32_e32 v0, 15 551; GFX9-NEXT: s_addk_i32 s1, 0x4004 552; GFX9-NEXT: s_lshl_b32 s0, s0, 2 553; GFX9-NEXT: scratch_store_dword off, v0, s1 554; GFX9-NEXT: s_waitcnt vmcnt(0) 555; GFX9-NEXT: s_addk_i32 s0, 0x4004 556; GFX9-NEXT: scratch_load_dword v0, off, s0 glc 557; GFX9-NEXT: s_waitcnt vmcnt(0) 558; GFX9-NEXT: s_endpgm 559; 560; GFX10-LABEL: store_load_sindex_large_offset_kernel: 561; GFX10: ; %bb.0: ; %bb 562; GFX10-NEXT: s_add_u32 s2, s2, s5 563; GFX10-NEXT: s_addc_u32 s3, s3, 0 564; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 565; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 566; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 567; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc 568; GFX10-NEXT: s_waitcnt vmcnt(0) 569; GFX10-NEXT: v_mov_b32_e32 v0, 15 570; GFX10-NEXT: s_waitcnt lgkmcnt(0) 571; GFX10-NEXT: s_and_b32 s1, s0, 15 572; GFX10-NEXT: s_lshl_b32 s0, s0, 2 573; GFX10-NEXT: s_lshl_b32 s1, s1, 2 574; GFX10-NEXT: s_addk_i32 s0, 0x4004 575; GFX10-NEXT: s_addk_i32 s1, 0x4004 576; GFX10-NEXT: scratch_store_dword off, v0, s0 577; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 578; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc 579; GFX10-NEXT: s_waitcnt vmcnt(0) 580; GFX10-NEXT: s_endpgm 581; 582; GFX940-LABEL: store_load_sindex_large_offset_kernel: 583; GFX940: ; %bb.0: ; %bb 584; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24 585; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1 586; GFX940-NEXT: s_waitcnt vmcnt(0) 587; GFX940-NEXT: v_mov_b32_e32 v0, 15 588; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 589; GFX940-NEXT: s_waitcnt lgkmcnt(0) 590; GFX940-NEXT: s_lshl_b32 s1, s0, 2 591; GFX940-NEXT: s_and_b32 s0, s0, 15 592; GFX940-NEXT: v_mov_b32_e32 v1, s1 593; GFX940-NEXT: s_lshl_b32 s0, s0, 2 594; GFX940-NEXT: scratch_store_dword v1, 
v0, vcc_hi sc0 sc1 595; GFX940-NEXT: s_waitcnt vmcnt(0) 596; GFX940-NEXT: v_mov_b32_e32 v0, s0 597; GFX940-NEXT: s_movk_i32 vcc_hi, 0x4004 598; GFX940-NEXT: scratch_load_dword v0, v0, vcc_hi sc0 sc1 599; GFX940-NEXT: s_waitcnt vmcnt(0) 600; GFX940-NEXT: s_endpgm 601; 602; GFX11-LABEL: store_load_sindex_large_offset_kernel: 603; GFX11: ; %bb.0: ; %bb 604; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24 605; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 606; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc 607; GFX11-NEXT: s_waitcnt vmcnt(0) 608; GFX11-NEXT: v_mov_b32_e32 v1, 15 609; GFX11-NEXT: s_waitcnt lgkmcnt(0) 610; GFX11-NEXT: s_lshl_b32 s1, s0, 2 611; GFX11-NEXT: s_and_b32 s0, s0, 15 612; GFX11-NEXT: v_mov_b32_e32 v0, s1 613; GFX11-NEXT: s_lshl_b32 s0, s0, 2 614; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 615; GFX11-NEXT: v_mov_b32_e32 v2, s0 616; GFX11-NEXT: scratch_store_b32 v0, v1, vcc_lo dlc 617; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 618; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 619; GFX11-NEXT: scratch_load_b32 v0, v2, vcc_lo glc dlc 620; GFX11-NEXT: s_waitcnt vmcnt(0) 621; GFX11-NEXT: s_endpgm 622bb: 623 %padding = alloca [4096 x i32], align 4, addrspace(5) 624 %i = alloca [32 x float], align 4, addrspace(5) 625 %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef 626 %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 627 %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* 628 %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx 629 %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* 630 store volatile i32 15, i32 addrspace(5)* %i8, align 4 631 %i9 = and i32 %idx, 15 632 %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 633 %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* 634 %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 635 ret void 636} 637 638define amdgpu_kernel void 
@store_load_vindex_large_offset_kernel() {
; Kernel test: volatile scratch store/load indexed by the workitem id, on an
; array declared after a [4096 x i32] (16 KiB) padding alloca.
;   * %padding is read once (volatile, undef index).
;   * 15 is stored to %i[tid]; then %i[31 - tid] is loaded.
; The expected code addresses %i at 0x4004 (= 4096*4 + 4); for the load it
; negates the index and carries the constant 31*4 as "offset:124".
; NOTE(review): %i1 and %i3 have no uses in this function.
; The check lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them with update_llc_test_checks.py instead of editing.
; GFX9-LABEL: store_load_vindex_large_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    scratch_load_dword v1, off, vcc_hi offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, 0x4004, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vindex_large_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0x4004, v0
; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v1
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off offset:124 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vindex_large_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    s_movk_i32 vcc_hi, 0x4004
; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi offset:124 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vindex_large_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_sub_nc_u32_e32 v1, 0, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_movk_i32 vcc_lo, 0x4004
; GFX11-NEXT:    scratch_load_b32 v0, v1, vcc_lo offset:124 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  %i3 = zext i32 %i2 to i64
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = sub nsw i32 31, %i2
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Callable (non-kernel) variant of the large-offset indexed test.
;   * %padding ([4096 x i32], 16 KiB) is read once (volatile, undef index).
;   * 15 is stored to %i[%idx]; then %i[%idx & 15] is loaded.
; The expected code forms the base of %i as s32 + 0x4004 (= 4096*4 + 4).
; NOTE(review): %i1 has no uses in this function.
; Check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with update_llc_test_checks.py instead of editing.
define void @store_load_vindex_large_offset_foo(i32 %idx) {
; GFX9-LABEL: store_load_vindex_large_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX9-NEXT:    v_add_u32_e32 v1, vcc_hi, v1
; GFX9-NEXT:    v_mov_b32_e32 v2, 15
; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX9-NEXT:    scratch_store_dword v1, v2, off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_add_u32_e32 v0, vcc_hi, v0
; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_vindex_large_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT:    v_mov_b32_e32 v2, 15
; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX10-NEXT:    v_add_nc_u32_e32 v0, vcc_lo, v0
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX10-NEXT:    v_add_nc_u32_e32 v1, vcc_lo, v1
; GFX10-NEXT:    scratch_store_dword v0, v2, off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_vindex_large_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
; GFX940-NEXT:    v_mov_b32_e32 v2, 15
; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_vindex_large_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
; GFX11-NEXT:    scratch_store_b32 v0, v2, vcc_lo dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
; GFX11-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %padding = alloca [4096 x i32], align 4, addrspace(5)
  %i = alloca [32 x float], align 4, addrspace(5)
  %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef
  %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
  %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
  %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx
  %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)*
  store volatile i32 15, i32 addrspace(5)* %i8, align 4
  %i9 = and i32 %idx, 15
  %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
  %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
  %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
  ret void
}

; Kernel test: constant element index too large for the small offsets used
; elsewhere in these checks.
;   * 13 is stored to %i at an undef index.
;   * 15 is stored to %i[4000], then %i[4000] is loaded back (all volatile).
; 4000 * 4 = 0x3e80, which is the constant materialized in the expected code.
define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
; GFX9-LABEL: store_load_large_imm_offset_kernel:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_mov_b32 vcc_hi, 0
; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
; GFX9-NEXT:    scratch_store_dword off, v0, vcc_hi offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_add_i32 s0, s0, 4
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_large_imm_offset_kernel:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s0, s0, s3
; GFX10-NEXT:    s_addc_u32 s1, s1, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3e80
; GFX10-NEXT:    s_add_i32 s0, s0, 4
; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_large_imm_offset_kernel:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    v_mov_b32_e32 v0, 13
; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3e80
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_large_imm_offset_kernel:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Callable (non-kernel) variant of store_load_large_imm_offset_kernel: the
; same three volatile accesses (13 at an undef index, then 15 stored to and
; reloaded from element 4000), with the expected addresses formed relative
; to s32.
define void @store_load_large_imm_offset_foo() {
; GFX9-LABEL: store_load_large_imm_offset_foo:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 13
; GFX9-NEXT:    s_movk_i32 s0, 0x3e80
; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 15
; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
; GFX9-NEXT:    scratch_store_dword off, v0, s0
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_large_imm_offset_foo:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v0, 13
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_movk_i32 s0, 0x3e80
; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_store_dword off, v1, s0
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_large_imm_offset_foo:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 13
; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3e80
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_large_imm_offset_foo:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
; GFX11-NEXT:    v_mov_b32_e32 v2, 15
; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:4 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:4 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  %i = alloca [4096 x i32], align 4, addrspace(5)
  %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
  store volatile i32 13, i32 addrspace(5)* %i1, align 4
  %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  store volatile i32 15, i32 addrspace(5)* %i7, align 4
  %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
  %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
  ret void
}

; Kernel test: index built from a kernel argument, the workitem id and a
; constant: element (%sidx + tid + 256) of a [32 x i32] scratch array gets a
; volatile store of 15 followed by a volatile load of the same address.
; In the expected code the constant 256*4 shows up as "offset:1024" (gfx9 and
; gfx10, after a separate +4 add) or as "offset:1028" (gfx940 and gfx11).
define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
; GFX9-LABEL: store_load_vidx_sidx_offset:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX9-NEXT:    v_add_u32_e32 v0, 4, v0
; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: store_load_vidx_sidx_offset:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_add_u32 s2, s2, s5
; GFX10-NEXT:    s_addc_u32 s3, s3, 0
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
; GFX10-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX10-NEXT:    v_add_nc_u32_e32 v0, 4, v0
; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_endpgm
;
; GFX940-LABEL: store_load_vidx_sidx_offset:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_load_dword s0, s[0:1], 0x24
; GFX940-NEXT:    v_mov_b32_e32 v1, 15
; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
; GFX940-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_endpgm
;
; GFX11-LABEL: store_load_vidx_sidx_offset:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1028 dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_endpgm
bb:
  %alloca = alloca [32 x i32], align 4, addrspace(5)
  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %add1 = add nsw i32 %sidx, %vidx
  %add2 = add nsw i32 %add1, 256
  %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
  store volatile i32 15, i32 addrspace(5)* %gep, align 4
  %load = load volatile i32, i32 addrspace(5)* %gep, align 4
  ret void
}

; Volatile i64 store of 15 followed by a volatile i64 load through an
; addrspace(5) pointer argument, both with align 8. The expected code is one
; 64-bit scratch store and one 64-bit scratch load on every target.
define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_aligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_aligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_aligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 15
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i64_aligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 8
  %load = load volatile i64, i64 addrspace(5)* %arg, align 8
  ret void
}

; Same accesses as store_load_i64_aligned but with align 1. The expected
; code is still one 64-bit scratch store and one 64-bit scratch load, i.e.
; the under-aligned access is not split on any of the tested targets.
define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_i64_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 15
; GFX9-NEXT:    v_mov_b32_e32 v2, 0
; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_i64_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_mov_b32_e32 v1, 15
; GFX10-NEXT:    v_mov_b32_e32 v2, 0
; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_i64_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 15
; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_i64_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    v_mov_b32_e32 v1, 15
; GFX11-NEXT:    v_mov_b32_e32 v2, 0
; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile i64 15, i64 addrspace(5)* %arg, align 1
  %load = load volatile i64, i64 addrspace(5)* %arg, align 1
  ret void
}

; Volatile store of <1, 2, 3> and volatile reload of a <3 x i32> through an
; addrspace(5) pointer argument, both with align 1. The expected code is one
; 96-bit scratch store and one 96-bit scratch load on every target.
define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v3i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s2, 3
; GFX9-NEXT:    s_mov_b32 s1, 2
; GFX9-NEXT:    s_mov_b32 s0, 1
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v3i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s2, 3
; GFX10-NEXT:    s_mov_b32 s1, 2
; GFX10-NEXT:    s_mov_b32 s0, 1
; GFX10-NEXT:    v_mov_b32_e32 v3, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v3i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s2, 3
; GFX940-NEXT:    s_mov_b32 s1, 2
; GFX940-NEXT:    s_mov_b32 s0, 1
; GFX940-NEXT:    v_mov_b32_e32 v4, s2
; GFX940-NEXT:    v_mov_b32_e32 v3, s1
; GFX940-NEXT:    v_mov_b32_e32 v2, s0
; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_v3i32_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_mov_b32 s2, 3
; GFX11-NEXT:    s_mov_b32 s1, 2
; GFX11-NEXT:    s_mov_b32 s0, 1
; GFX11-NEXT:    v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
; GFX11-NEXT:    v_mov_b32_e32 v1, s0
; GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Volatile store of <1, 2, 3, 4> and volatile reload of a <4 x i32> through
; an addrspace(5) pointer argument, both with align 1. The expected code is
; one 128-bit scratch store and one 128-bit scratch load on every target.
define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg) {
; GFX9-LABEL: store_load_v4i32_unaligned:
; GFX9:       ; %bb.0: ; %bb
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    s_mov_b32 s3, 4
; GFX9-NEXT:    s_mov_b32 s2, 3
; GFX9-NEXT:    s_mov_b32 s1, 2
; GFX9-NEXT:    s_mov_b32 s0, 1
; GFX9-NEXT:    v_mov_b32_e32 v4, s3
; GFX9-NEXT:    v_mov_b32_e32 v3, s2
; GFX9-NEXT:    v_mov_b32_e32 v2, s1
; GFX9-NEXT:    v_mov_b32_e32 v1, s0
; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_load_v4i32_unaligned:
; GFX10:       ; %bb.0: ; %bb
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    s_mov_b32 s3, 4
; GFX10-NEXT:    s_mov_b32 s2, 3
; GFX10-NEXT:    s_mov_b32 s1, 2
; GFX10-NEXT:    s_mov_b32 s0, 1
; GFX10-NEXT:    v_mov_b32_e32 v4, s3
; GFX10-NEXT:    v_mov_b32_e32 v3, s2
; GFX10-NEXT:    v_mov_b32_e32 v2, s1
; GFX10-NEXT:    v_mov_b32_e32 v1, s0
; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: store_load_v4i32_unaligned:
; GFX940:       ; %bb.0: ; %bb
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    s_mov_b32 s3, 4
; GFX940-NEXT:    s_mov_b32 s2, 3
; GFX940-NEXT:    s_mov_b32 s1, 2
; GFX940-NEXT:    s_mov_b32 s0, 1
; GFX940-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: store_load_v4i32_unaligned:
; GFX11:       ; %bb.0: ; %bb
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    s_mov_b32 s3, 4
; GFX11-NEXT:    s_mov_b32 s2, 3
; GFX11-NEXT:    s_mov_b32 s1, 2
; GFX11-NEXT:    s_mov_b32 s0, 1
; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    s_setpc_b64 s[30:31]
bb:
  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
  %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1
  ret void
}

; Intrinsic declaration; called by the tests in this file that index scratch
; with the workitem id.
declare i32 @llvm.amdgcn.workitem.id.x()