1; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=CI %s
2; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-code-object-v3,-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=HSA -check-prefix=GFX9 %s
3
4; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
5; HSA: enable_sgpr_private_segment_buffer = 1
6; HSA: enable_sgpr_dispatch_ptr = 0
7; CI: enable_sgpr_queue_ptr = 1
8; GFX9: enable_sgpr_queue_ptr = 0
9
10; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
11; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
12; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
13; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
14; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
15; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
16; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
17
18; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
19; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
20; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
21; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
22; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
23
24; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
25; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
26; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
27; GFX9-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
28; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
29
30; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
31
32; At most 2 digits. Make sure src_shared_base is not counted as a high
33; number SGPR.
34
35; CI: NumSgprs: {{[0-9][0-9]+}}
36; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
  ; LDS (group) -> generic cast; needs the shared-memory aperture base for the high half.
  %flat.ptr = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %flat.ptr
  ret void
}
42
43; Test handling inside a non-kernel
44; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast_func:
45; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
46; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
47; CI-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
48; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
49; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0
50
51; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
52; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
53; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
54; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
55
56; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
57; GFX9-DAG: v_cmp_ne_u32_e32 vcc, -1, v0
58; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, v0, vcc
59; GFX9-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
60
61; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
define void @use_group_to_flat_addrspacecast_func(i32 addrspace(3)* %ptr) #0 {
  ; Non-kernel variant: the incoming pointer arrives in a VGPR rather than via kernarg.
  %generic = addrspacecast i32 addrspace(3)* %ptr to i32*
  store volatile i32 7, i32* %generic
  ret void
}
67
68; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
69; HSA: enable_sgpr_private_segment_buffer = 1
70; HSA: enable_sgpr_dispatch_ptr = 0
71; CI: enable_sgpr_queue_ptr = 1
72; GFX9: enable_sgpr_queue_ptr = 0
73
74; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
75; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
76; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
77
78; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
79; CI-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
80; CI-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
81; CI-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
82; CI-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
83
84; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
85; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
86; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
87; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]
88
89; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
90
91; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
92; GFX9: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
93; GFX9: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]], vcc
94; GFX9: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
95; GFX9-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
96
97; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
98
99; CI: NumSgprs: {{[0-9][0-9]+}}
100; GFX9: NumSgprs: {{[0-9]+}}
define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 {
  ; Private (scratch) -> generic cast; needs the private aperture base for the high half.
  %flat.ptr = addrspacecast i32 addrspace(5)* %ptr to i32*
  store volatile i32 7, i32* %flat.ptr
  ret void
}
106
107; no-op
108; HSA-LABEL: {{^}}use_global_to_flat_addrspacecast:
109; HSA: enable_sgpr_queue_ptr = 0
110
111; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
112; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
113; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
114; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
115; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
  ; Global -> generic is a no-op cast; the 64-bit address is reused directly.
  %generic = addrspacecast i32 addrspace(1)* %ptr to i32*
  store volatile i32 7, i32* %generic
  ret void
}
121
122; no-op
; HSA-LABEL: {{^}}use_constant_to_flat_addrspacecast:
124; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
125; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
126; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
127; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  ; Constant -> generic is a no-op cast.
  %generic = addrspacecast i32 addrspace(4)* %ptr to i32*
  %val = load volatile i32, i32* %generic
  ret void
}
133
; HSA-LABEL: {{^}}use_constant_to_global_addrspacecast:
135; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}
136; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
137; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
138; HSA: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define amdgpu_kernel void @use_constant_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
  ; Constant -> global reinterprets the same 64-bit address.
  %global.ptr = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
  %val = load volatile i32, i32 addrspace(1)* %global.ptr
  ret void
}
144
145; HSA-LABEL: {{^}}use_flat_to_group_addrspacecast:
146; HSA: enable_sgpr_private_segment_buffer = 1
147; HSA: enable_sgpr_dispatch_ptr = 0
148; HSA: enable_sgpr_queue_ptr = 0
149
150; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
151; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
152; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
153; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
154; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
155; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
  ; Generic -> LDS keeps the low 32 bits; generic null maps to the group invalid pointer (-1).
  %group.ptr = addrspacecast i32* %ptr to i32 addrspace(3)*
  store volatile i32 0, i32 addrspace(3)* %group.ptr
  ret void
}
161
162; HSA-LABEL: {{^}}use_flat_to_private_addrspacecast:
163; HSA: enable_sgpr_private_segment_buffer = 1
164; HSA: enable_sgpr_dispatch_ptr = 0
165; HSA: enable_sgpr_queue_ptr = 0
166
167; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
168; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
169; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
170; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
171; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
172; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
  ; Generic -> private keeps the low 32 bits; generic null maps to the private invalid pointer (-1).
  %priv.ptr = addrspacecast i32* %ptr to i32 addrspace(5)*
  store volatile i32 0, i32 addrspace(5)* %priv.ptr
  ret void
}
178
179; HSA-LABEL: {{^}}use_flat_to_global_addrspacecast:
180; HSA: enable_sgpr_queue_ptr = 0
181
182; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
183; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
184; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
185; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
186; HSA: {{flat|global}}_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
  ; Generic -> global is a no-op cast.
  %global.ptr = addrspacecast i32* %ptr to i32 addrspace(1)*
  store volatile i32 0, i32 addrspace(1)* %global.ptr
  ret void
}
192
193; HSA-LABEL: {{^}}use_flat_to_constant_addrspacecast:
194; HSA: enable_sgpr_queue_ptr = 0
195
196; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
197; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
  ; Generic -> constant is a no-op cast; the load can then be selected as a scalar load.
  %const.ptr = addrspacecast i32* %ptr to i32 addrspace(4)*
  %val = load volatile i32, i32 addrspace(4)* %const.ptr
  ret void
}
203
204; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
205; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
206; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
207; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 16, 16)
208; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
209; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]
210
211; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base
212
213; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
214; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
215; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
  ; Group null becomes the aperture base: low half 0, high half the shared base.
  %flat.null = addrspacecast i32 addrspace(3)* null to i32*
  store volatile i32 7, i32* %flat.null
  ret void
}
221
222; HSA-LABEL: {{^}}cast_0_flat_to_group_addrspacecast:
223; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
224; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
225; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
  ; Generic null becomes the group "invalid pointer" value, -1.
  %group.null = addrspacecast i32* null to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %group.null
  ret void
}
231
232; HSA-LABEL: {{^}}cast_neg1_group_to_flat_addrspacecast:
233; HSA: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
234; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
235; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
236; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
  ; Group -1 (the invalid pointer) folds to generic null.
  %flat.ptr = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32*
  store volatile i32 7, i32* %flat.ptr
  ret void
}
242
243; HSA-LABEL: {{^}}cast_neg1_flat_to_group_addrspacecast:
244; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
245; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
246; HSA: ds_write_b32 [[PTR]], [[K]]
define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
  ; Generic -1 stays -1 when truncated into the group address space.
  %group.ptr = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(3)*
  store volatile i32 7, i32 addrspace(3)* %group.ptr
  ret void
}
252
253; FIXME: Shouldn't need to enable queue ptr
254; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
255; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
256; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(HW_REG_SH_MEM_BASES, 0, 16)
; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_PRIVATE_BASE]]

; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_private_base
262
263; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
264; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
265; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
  ; Private null becomes the private aperture base: low half 0, high half the private base.
  %flat.null = addrspacecast i32 addrspace(5)* null to i32*
  store volatile i32 7, i32* %flat.null
  ret void
}
271
272; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
273; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
274; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
275; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
  ; Generic null becomes the private "invalid pointer" value, -1.
  %priv.null = addrspacecast i32* null to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %priv.null
  ret void
}
281
282
283; HSA-LABEL: {{^}}cast_neg1_private_to_flat_addrspacecast:
284; CI: enable_sgpr_queue_ptr = 1
285; GFX9: enable_sgpr_queue_ptr = 0
286
287; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
288; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
289; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
290; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define amdgpu_kernel void @cast_neg1_private_to_flat_addrspacecast() #0 {
  ; Private -1 (the invalid pointer) folds to generic null.
  %flat.ptr = addrspacecast i32 addrspace(5)* inttoptr (i32 -1 to i32 addrspace(5)*) to i32*
  store volatile i32 7, i32* %flat.ptr
  ret void
}
296
297; HSA-LABEL: {{^}}cast_neg1_flat_to_private_addrspacecast:
298; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
299; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
300; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, 0
define amdgpu_kernel void @cast_neg1_flat_to_private_addrspacecast() #0 {
  ; Generic -1 stays -1 when truncated into the private address space.
  %priv.ptr = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(5)*
  store volatile i32 7, i32 addrspace(5)* %priv.ptr
  ret void
}
306
307
308; Disable optimizations in case there are optimizations added that
309; specialize away generic pointer accesses.
310
311; HSA-LABEL: {{^}}branch_use_flat_i32:
312; HSA: {{flat|global}}_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
313; HSA: s_endpgm
define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
  ; Merge generic pointers originating from two different address spaces so the
  ; store below cannot be specialized and must remain a flat access.
  %use.local = icmp ne i32 %c, 0
  br i1 %use.local, label %cast.local, label %cast.global

cast.local:
  %from.lds = addrspacecast i32 addrspace(3)* %lptr to i32*
  br label %merge

cast.global:
  %from.global = addrspacecast i32 addrspace(1)* %gptr to i32*
  br label %merge

merge:
  %generic.ptr = phi i32* [ %from.lds, %cast.local ], [ %from.global, %cast.global ]
  store volatile i32 %x, i32* %generic.ptr, align 4
  ret void
}
334
335; Check for prologue initializing special SGPRs pointing to scratch.
336; HSA-LABEL: {{^}}store_flat_scratch:
337; CI-DAG: s_mov_b32 flat_scratch_lo, s9
338; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
339; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
340
341; GFX9: s_add_u32 flat_scratch_lo, s6, s9
342; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
343
344; HSA: {{flat|global}}_store_dword
345; HSA: s_barrier
346; HSA: {{flat|global}}_load_dword
define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
  ; Scratch -> generic cast; accessing it flat forces the prologue to set up
  ; the flat_scratch register pair.
  %scratch = alloca i32, i32 9, align 4, addrspace(5)
  %tid = call i32 @llvm.amdgcn.workitem.id.x() #2
  %slot = getelementptr i32, i32 addrspace(5)* %scratch, i32 %tid
  %flat.ptr = addrspacecast i32 addrspace(5)* %slot to i32*
  store volatile i32 %tid, i32* %flat.ptr
  ; Dummy call: the barrier only separates the store from the reload.
  call void @llvm.amdgcn.s.barrier() #1
  %val = load volatile i32, i32* %flat.ptr, align 4
  store volatile i32 %val, i32 addrspace(1)* %out, align 4
  ret void
}
359
360; HSA-LABEL: {{^}}use_constant_to_constant32_addrspacecast
361; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
362; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
363; GFX9: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}, [[PTRPTR]], 0x0{{$}}
364; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
365; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
366; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}}
define amdgpu_kernel void @use_constant_to_constant32_addrspacecast(i8 addrspace(4)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  ; 64-bit constant -> constant32 truncates to the low 32 bits; the GEP is then
  ; performed as 32-bit arithmetic before the scalar load.
  %p = load volatile i8 addrspace(4)*, i8 addrspace(4)* addrspace(4)* %ptr.ptr
  %p32 = addrspacecast i8 addrspace(4)* %p to i8 addrspace(6)*
  %byte.ptr = getelementptr i8, i8 addrspace(6)* %p32, i32 %offset
  %word.ptr = bitcast i8 addrspace(6)* %byte.ptr to i32 addrspace(6)*
  %val = load volatile i32, i32 addrspace(6)* %word.ptr, align 4
  ret void
}
375
376; HSA-LABEL: {{^}}use_global_to_constant32_addrspacecast
377; GFX9: s_load_dwordx2 [[PTRPTR:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0{{$}}
378; GFX9: s_load_dword [[OFFSET:s[0-9]+]], s[4:5], 0x8{{$}}
379; GFX9: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}, [[PTRPTR]], 0x0{{$}}
380; GFX9: s_mov_b32 s[[PTR_HI]], 0{{$}}
381; GFX9: s_add_i32 s[[PTR_LO]], s[[PTR_LO]], [[OFFSET]]
382; GFX9: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x0{{$}}
define amdgpu_kernel void @use_global_to_constant32_addrspacecast(i8 addrspace(1)* addrspace(4)* %ptr.ptr, i32 %offset) #0 {
  ; Global -> constant32 truncates to the low 32 bits; the GEP is then
  ; performed as 32-bit arithmetic before the scalar load.
  %p = load volatile i8 addrspace(1)*, i8 addrspace(1)* addrspace(4)* %ptr.ptr
  %p32 = addrspacecast i8 addrspace(1)* %p to i8 addrspace(6)*
  %byte.ptr = getelementptr i8, i8 addrspace(6)* %p32, i32 %offset
  %word.ptr = bitcast i8 addrspace(6)* %byte.ptr to i32 addrspace(6)*
  %val = load volatile i32, i32 addrspace(6)* %word.ptr, align 4
  ret void
}
391
392declare void @llvm.amdgcn.s.barrier() #1
393declare i32 @llvm.amdgcn.workitem.id.x() #2
394
395attributes #0 = { nounwind }
396attributes #1 = { nounwind convergent }
397attributes #2 = { nounwind readnone }
398