; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri --amdhsa-code-object-version=2 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=2 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 -check-prefix=GCN %s
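
; Casting a group (LDS) or private (scratch) pointer to a flat pointer
; needs the matching memory aperture base for the high 32 bits. In the
; checks below, CI loads the aperture through the queue pointer SGPR
; pair (SMRD dword offsets 0x10 for the shared aperture, 0x11 for the
; private one), so the kernel must enable enable_sgpr_queue_ptr. GFX9
; instead reads HW_REG_SH_MEM_BASES with s_getreg_b32 (shared base in
; bits [31:16], private base in bits [15:0]) and shifts it into place,
; so no queue pointer is required.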

; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
; CI-DAG: s_cselect_b64 vcc, -1, 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9: s_cmp_lg_u32 [[PTR]], -1
; GFX9: s_cselect_b64 vcc, -1, 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]

; At most 2 digits. Make sure src_shared_base is not counted as a
; high-numbered SGPR.

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; Test handling inside a non-kernel function.
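; The group pointer arrives in a VGPR (v0), so the null check becomes a
; VALU compare against -1 writing vcc; on CI the aperture is still
; loaded through the queue pointer, which is passed in s[6:7] here.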
; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[6:7], 0x10{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0

; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc

; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]
define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}
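
; Private-to-flat casts follow the same pattern with the other aperture:
; the private base is the second aperture dword on CI (offset 0x11) and
; the low 16 bits of HW_REG_SH_MEM_BASES on GFX9.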

; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]

; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; CI-DAG: s_cmp_lg_u32 [[PTR]], -1
; CI-DAG: s_cselect_b64 vcc, -1, 0
; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base

; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; GFX9: s_cmp_lg_u32 [[PTR]], -1
; GFX9: s_cselect_b64 vcc, -1, 0
; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]

; HSA: flat_store_dword v[[[LO]]:[[HI]]], [[K]]

; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(5)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; No-op: a global pointer is already a valid flat address, so the cast
; is free and the pointer value is used unchanged.
; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]
define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(1)* %ptr to i32*
  store volatile i32 7, i32* %stof
  ret void
}

; No-op: a constant (addrspace(4)) pointer is likewise already a valid
; flat address.
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]]
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
  %ld = load volatile i32, i32* %stof
  ret void
}
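
; Constant and global pointers share the same 64-bit representation, so
; a constant-to-global cast is also free; on GFX9 the load can then be
; selected as a global_load with a zero VGPR offset.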

; HSA-LABEL: {{^}}use_constant_to_global_addrspacecast:
; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]]
; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; CI: {{flat|global}}_load_dword v{{[0-9]+}}, v[[[VPTRLO]]:[[VPTRHI]]]

; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GFX9: global_load_dword v{{[0-9]+}}, [[ZERO]], s[[[PTRLO]]:[[PTRHI]]]
define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
  %ld = load volatile i32, i32 addrspace(1)* %stof
  ret void
}
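
; Casting flat to group keeps only the low 32 bits, and a null flat
; pointer (0) must map to the group null sentinel (-1), hence the
; compare-and-select on the full 64-bit value below.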

; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]]
; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}}
; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0
; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]]
; CI-DAG: v_cmp_ne_u64_e64 vcc, s[[[PTR_LO]]:[[PTR_HI]]], 0{{$}}
; CI-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
; CI-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; GFX9-DAG: s_cmp_lg_u64 s[[[CMP_LO:[0-9]+]]:[[CMP_HI:[0-9]+]]], 0
; GFX9-DAG: s_cselect_b32 s[[PTR_LO]], s[[PTR_LO]], -1
; GFX9-DAG: v_mov_b32_e32 [[CASTPTR:v[0-9]+]], s[[PTR_LO]]
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
  store volatile i32 0, i32 addrspace(5)* %ftos
  ret void
}
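
; No-op: flat and global addresses are identical, so the store just uses
; the incoming pointer value.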

; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]], s[4:5], 0x0
; CI-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; CI-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; CI: flat_store_dword v[[[VPTRLO]]:[[VPTRHI]]], [[K]]

; GFX9: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GFX9: global_store_dword [[ZERO]], [[ZERO]], s[[[PTRLO]]:[[PTRHI]]{{\]$}}
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %ftos
  ret void
}

; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
; HSA: enable_sgpr_queue_ptr = 0

; HSA: s_load_dwordx2 s[[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]], s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s[[[PTRLO]]:[[PTRHI]]], 0x0
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
  %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
  %ld = load volatile i32, i32 addrspace(4)* %ftos
  ret void
}
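
; Constant-foldable casts of null and -1: the flat null pointer (0)
; corresponds to -1 in the group/private address spaces and vice versa,
; so these casts reduce to constants.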

; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %cast
  ret void
}

; FIXME: Shouldn't need to enable queue ptr
; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_private_base

; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(5)* null to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32* null to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
; CI: enable_sgpr_queue_ptr = 1
; GFX9: enable_sgpr_queue_ptr = 0

; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: {{flat|global}}_store_dword v[[[LO]]:[[HI]]], v[[K]]
define amdgpu_kernel void @cast_neg1_private_to_flat_addrspacecast() #0 {
  %cast = addrspacecast i32 addrspace(5)* inttoptr (i32 -1 to i32 addrspace(5)*) to i32*
  store volatile i32 7, i32* %cast
  ret void
}

; HSA-LABEL: {{^}}cast_neg1_flat_to_private_addrspacecast:
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @cast_neg1_flat_to_private_addrspacecast() #0 {
  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %cast
  ret void
}

; Disable optimizations in case optimizations are added that specialize
; away generic pointer accesses.

; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: {{flat|global}}_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  %cmp = icmp ne i32 %c, 0
  br i1 %cmp, label %local, label %global

local:
  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32*
  br label %end

global:
  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32*
  br label %end

end:
  %fptr = phi i32* [ %flat_local, %local ], [ %flat_global, %global ]
  store volatile i32 %x, i32* %fptr, align 4
;  %val = load i32, i32* %fptr, align 4
;  store i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}

; Check for prologue initializing special SGPRs pointing to scratch.
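; On CI the prologue writes flat_scratch_lo and flat_scratch_hi
; separately, shifting the computed base right by 8; on GFX9 it is a
; single 64-bit add of the wave scratch offset to the flat scratch base.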
; HSA-LABEL: {{^}}store_flat_scratch:
; CI-DAG: s_mov_b32 flat_scratch_lo, s9
; CI-DAG: s_add_i32 [[ADD:s[0-9]+]], s8, s11
; CI-DAG: s_lshr_b32 flat_scratch_hi, [[ADD]], 8

; GFX9: s_add_u32 flat_scratch_lo, s6, s9
; GFX9: s_addc_u32 flat_scratch_hi, s7, 0

; HSA: {{flat|global}}_store_dword
; HSA: s_barrier
; HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  %alloca = alloca i32, i32 9, align 4, addrspace(5)
  %x = call i32 @llvm.amdgcn.workitem.id.x() #2
  %pptr = getelementptr i32, i32 addrspace(5)* %alloca, i32 %x
  %fptr = addrspacecast i32 addrspace(5)* %pptr to i32*
  store volatile i32 %x, i32* %fptr
  ; Dummy call
  call void @llvm.amdgcn.s.barrier() #1
  %reload = load volatile i32, i32* %fptr, align 4
  store volatile i32 %reload, i32 addrspace(1)* %out, align 4
  ret void
}
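
; Casting a 64-bit constant or global pointer to the 32-bit constant
; address space (6) keeps only the low 32 bits; re-extending it for the
; scalar load zero-fills the high half (the s_mov_b32 of 0 into PTR_HI).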

; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast:
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(i8 addrspace(4)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  %ptr = load volatile i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %ptr.ptr
  %addrspacecast = addrspacecast i8 addrspace(4)* %ptr to i8 addrspace(6)*
  %gep = getelementptr i8, i8 addrspace(6)* %addrspacecast, i32 %offset
  %ptr.cast = bitcast i8 addrspace(6)* %gep to i32 addrspace(6)*
  %load = load volatile i32, i32 addrspace(6)* %ptr.cast, align 4
  ret void
}

; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast:
; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
; GFX9: s_load_dwordx2 s[[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]], [[PTRPTR]], 0x0{{$}}
; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
; GFX9: s_load_dword s{{[0-9]+}}, s[[[PTR_LO]]:[[PTR_HI]]], 0x0{{$}}
define amdgpu_kernel void @use_global_to_constant32_addrspacecast(i8 addrspace(1)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  %ptr = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* %ptr.ptr
  %addrspacecast = addrspacecast i8 addrspace(1)* %ptr to i8 addrspace(6)*
  %gep = getelementptr i8, i8 addrspace(6)* %addrspacecast, i32 %offset
  %ptr.cast = bitcast i8 addrspace(6)* %gep to i32 addrspace(6)*
  %load = load volatile i32, i32 addrspace(6)* %ptr.cast, align 4
  ret void
}
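
; Extending a 32-bit constant pointer back to flat fills the high half
; from the "amdgpu-32bit-address-high-bits" attribute: 0 by default, and
; 0xffff8000 with attribute #3 below.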

; GCN-LABEL: {{^}}use_constant32bit_to_flat_addrspacecast_0:
; GCN: s_load_dword [[PTR:s[0-9]+]],
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], [[PTR]]
; GCN: flat_load_dword v{{[0-9]+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @use_constant32bit_to_flat_addrspacecast_0(i32 addrspace(6)* %ptr) #0 {
  %stof = addrspacecast i32 addrspace(6)* %ptr to i32*
  %load = load volatile i32, i32* %stof
  ret void
}

; GCN-LABEL: {{^}}use_constant32bit_to_flat_addrspacecast_1:
; GCN: s_load_dword [[PTR:s[0-9]+]],
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0xffff8000
; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], [[PTR]]
; GCN: flat_load_dword v{{[0-9]+}}, v[[[LO]]:[[HI]]]
define amdgpu_kernel void @use_constant32bit_to_flat_addrspacecast_1(i32 addrspace(6)* %ptr) #3 {
  %stof = addrspacecast i32 addrspace(6)* %ptr to i32*
  %load = load volatile i32, i32* %stof
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1
declare i32 @llvm.amdgcn.workitem.id.x() #2

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }
attributes #2 = { nounwind readnone }
attributes #3 = { nounwind "amdgpu-32bit-address-high-bits"="0xffff8000" }