; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s

; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
; CI-DAG: s_cselect_b64 vcc, -1, 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9: s_cmp_lg_u32 [[PTR]], -1
; GFX9: s_cselect_b64 vcc, -1, 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
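; The checks above encode the generic group-to-flat expansion; roughly, as
; illustrative pseudocode only (assumed semantics, not FileCheck input):
;   flat.hi = (lds_ptr == -1) ? 0 : shared_aperture_base_hi
;   flat.lo = (lds_ptr == -1) ? 0 : lds_ptr
; On CI the shared aperture is read through the queue pointer (hence the
; enabled queue-ptr SGPR), while gfx900 reads it from HW_REG_SH_MEM_BASES.
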
; At most 2 digits. Make sure src_shared_base is not counted as a high
; number SGPR.

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; Test handling inside a non-kernel function.
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0

; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc

; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]

; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
; CI-DAG: s_cselect_b64 vcc, -1, 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base

; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9: s_cmp_lg_u32 [[PTR]], -1
; GFX9: s_cselect_b64 vcc, -1, 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(5)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}
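; The private-to-flat expansion above mirrors the group case but selects the
; private aperture instead; per the checks, that aperture comes from queue
; pointer offset 0x11 (vs. 0x10 for shared) on CI, and from the low 16 bits
; (offset 0) of HW_REG_SH_MEM_BASES (vs. the high 16) on gfx900.
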
; no-op
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]]
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
  %ld = load volatile i32, i32* %stof
  ret void
}

; HSA-LABEL: {{^}}use_constant_to_global_addrspacecast:
; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; CI: {{flat|global}}_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]]

; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_load_dword v{{[0-9]+}}, [[ZERO]], s[[[PTRLO]]:[[PTRHI]]]
define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
  %ld = load volatile i32, i32 addrspace(1)* %stof
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]]
; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}}
; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0
; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]]
; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}}
; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0
; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
  store volatile i32 0, i32 addrspace(5)* %ftos
  ret void
}
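; The flat-to-group and flat-to-private casts above go the other way: a null
; check plus truncation to the low 32 bits; roughly (illustrative pseudocode
; only, assumed semantics):
;   segment_ptr = (flat_ptr == 0) ? -1 : (u32)flat_ptr
; No aperture is involved in this direction, which is why queue_ptr stays
; disabled in the kernel headers above.
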
; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]], s[4:5], 0x0
; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; CI: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]

; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GFX9: global_store_dword [[ZERO]], [[ZERO]], s[[[PTRLO]]:[[PTRHI]]{{\]$}}
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]], s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s[[[PTRLO]]:[[PTRHI]]], 0x0
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
  load volatile i32, i32 addrspace(4)* %ftos
  ret void
}

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}
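; The constant-folded cases above follow the same conventions: the group
; null sentinel (-1) maps to flat null (both halves 0), group address 0 maps
; to the shared aperture base, and flat null maps back to the all-ones
; 32-bit sentinel. The private-null variants below behave analogously.
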
; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_private_base

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(5)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_neg1_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(5)* inttoptr (i32 -1 to i32 addrspace(5)*) to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @cast_neg1_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %cast
  ret void
}

; Disable optimizations in case there are optimizations added that
; specialize away generic pointer accesses.

; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: {{flat|global}}_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %local, label %global

local:
  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32*
  br label %end

global:
  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32*
  br label %end

end:
  %fptr = phi i32* [ %flat_local, %local ], [ %flat_global, %global ]
  store volatile i32 %x, i32* %fptr, align 4
;  %val = load i32, i32* %fptr, align 4
;  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Check for prologue initializing special SGPRs pointing to scratch.
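; On CI parts the prologue packs the scratch base into flat_scratch_hi in
; 256-byte units (the shift right by 8 converts the byte offset), while
; gfx900 forms a 64-bit base with a plain add/addc pair; this is an
; illustrative reading of the checks below, not an additional check.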
; HSA-LABEL: {{^}}store_flat_scratch:
; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_i32 [[ADD:s[0-9]+]], s8, s11
; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8

; GFX9: s_add_u32 flat_scratch_lo, s6, s9
; GFX9: s_addc_u32 flat_scratch_hi, s7, 0

; HSA: {{flat|global}}_store_dword
; HSA: s_barrier
; HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4, addrspace(5)
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
  %pptr = getelementptr i32, i32 addrspace(5)* %alloca, i32 %x
  %fptr = addrspacecast i32 addrspace(5)* %pptr to i32*
  store volatile i32 %x, i32* %fptr
  ; Dummy call
  call void @llvm.amdgcn.s.barrier() #1
  %reload = load volatile i32, i32* %fptr, align 4
  store volatile i32 %reload, i32 addrspace(1)* %out, align 4
  ret void
}

; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast:
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(i8 addrspace(4)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  %ptr = load volatile i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %ptr.ptr
  %addrspacecast = addrspacecast i8 addrspace(4)* %ptr to i8 addrspace(6)*
  %gep = getelementptr i8, i8 addrspace(6)* %addrspacecast, i32 %offset
  %ptr.cast = bitcast i8 addrspace(6)* %gep to i32 addrspace(6)*
  %load = load volatile i32, i32 addrspace(6)* %ptr.cast, align 4
  ret void
}

; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast:
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
define amdgpu_kernel void @use_global_to_constant32_addrspacecast(i8 addrspace(1)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  %ptr = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* %ptr.ptr
  %addrspacecast = addrspacecast i8 addrspace(1)* %ptr to i8 addrspace(6)*
  %gep = getelementptr i8, i8 addrspace(6)* %addrspacecast, i32 %offset
  %ptr.cast = bitcast i8 addrspace(6)* %gep to i32 addrspace(6)*
  %load = load volatile i32, i32 addrspace(6)* %ptr.cast, align 4
  ret void
}

; GCN-LABEL: {{^}}use_constant32bit_to_flat_addrspacecast_0:
; GCN: s_load_dword [[PTR:s[0-9]+]],
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], [[PTR]]
; GCN: flat_load_dword v{{[0-9]+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @use_constant32bit_to_flat_addrspacecast_0(i32 addrspace(6)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(6)* %ptr to i32*
  %load = load volatile i32, i32* %stof
  ret void
}
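; A 32-bit constant (addrspace 6) pointer is extended to 64 bits by
; materializing the high half explicitly: zero by default (the _0 variant
; above), or the value of the "amdgpu-32bit-address-high-bits" function
; attribute (the _1 variant below, 0xffff8000 via attributes #3).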
; GCN-LABEL: {{^}}use_constant32bit_to_flat_addrspacecast_1:
; GCN: s_load_dword [[PTR:s[0-9]+]],
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0xffff8000
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], [[PTR]]
; GCN: flat_load_dword v{{[0-9]+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @use_constant32bit_to_flat_addrspacecast_1(i32 addrspace(6)* %ptr) #3 {
  %stof = addrspacecast i32 addrspace(6)* %ptr to i32*
  %load = load volatile i32, i32* %stof
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind "amdgpu-32bit-address-high-bits"="0xffff8000" }