; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI,CIVI-HSA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,GFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,GFX10 %s
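; Summary comment (added): these tests cover selection of flat (generic
; address space) loads and stores reached through addrspacecast from global
; and private (scratch) pointers. The offset tests below check that immediate
; offsets are folded into flat instructions on GFX9 but not on CI/VI or GFX10,
; as reflected in the GFX9-only offset:4095 checks.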

; CHECK-LABEL: {{^}}store_flat_i32:
; CHECK-DAG: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
; CHECK-DAG: s_load_dword s[[SDATA:[0-9]+]],
; CHECK: s_waitcnt lgkmcnt(0)
; CHECK-DAG: v_mov_b32_e32 v[[DATA:[0-9]+]], s[[SDATA]]
; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; CHECK: flat_store_dword v[[[LO_VREG]]:[[HI_VREG]]], v[[DATA]]
define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
  store volatile i32 %x, i32* %fptr, align 4
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i64:
; CHECK: flat_store_dwordx2
define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
  store volatile i64 %x, i64* %fptr, align 8
  ret void
}

; CHECK-LABEL: {{^}}store_flat_v4i32:
; CHECK: flat_store_dwordx4
define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
  store volatile <4 x i32> %x, <4 x i32>* %fptr, align 16
  ret void
}

; CHECK-LABEL: {{^}}store_flat_trunc_i16:
; CHECK: flat_store_short
define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %y = trunc i32 %x to i16
  store volatile i16 %y, i16* %fptr, align 2
  ret void
}

; CHECK-LABEL: {{^}}store_flat_trunc_i8:
; CHECK: flat_store_byte
define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %y = trunc i32 %x to i8
  store volatile i8 %y, i8* %fptr, align 2
  ret void
}
; CHECK-LABEL: load_flat_i32:
; CHECK: flat_load_dword
define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
  %fload = load volatile i32, i32* %fptr, align 4
  store i32 %fload, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: load_flat_i64:
; CHECK: flat_load_dwordx2
define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
  %fload = load volatile i64, i64* %fptr, align 8
  store i64 %fload, i64 addrspace(1)* %out, align 8
  ret void
}

; CHECK-LABEL: load_flat_v4i32:
; CHECK: flat_load_dwordx4
define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
  %fload = load volatile <4 x i32>, <4 x i32>* %fptr, align 32
  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
  ret void
}

; CHECK-LABEL: sextload_flat_i8:
; CHECK: flat_load_sbyte
define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %fload = load volatile i8, i8* %fptr, align 4
  %ext = sext i8 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: zextload_flat_i8:
; CHECK: flat_load_ubyte
define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
  %fload = load volatile i8, i8* %fptr, align 4
  %ext = zext i8 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: sextload_flat_i16:
; CHECK: flat_load_sshort
define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %fload = load volatile i16, i16* %fptr, align 4
  %ext = sext i16 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: zextload_flat_i16:
; CHECK: flat_load_ushort
define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
  %fload = load volatile i16, i16* %fptr, align 4
  %ext = zext i16 %fload to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; CHECK-LABEL: flat_scratch_unaligned_load:
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
; CHECK: flat_load_ubyte
define amdgpu_kernel void @flat_scratch_unaligned_load() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
  %ld = load volatile i32, i32* %fptr, align 1
  ret void
}

; CHECK-LABEL: flat_scratch_unaligned_store:
; CHECK: flat_store_byte
; CHECK: flat_store_byte
; CHECK: flat_store_byte
; CHECK: flat_store_byte
define amdgpu_kernel void @flat_scratch_unaligned_store() {
  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
  store volatile i32 0, i32* %fptr, align 1
  ret void
}

; CHECK-LABEL: flat_scratch_multidword_load:
; CIVI-HSA: flat_load_dword v
; CIVI-HSA: flat_load_dword v
; GFX9:  flat_load_dwordx2
; GFX10: flat_load_dwordx2
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define amdgpu_kernel void @flat_scratch_multidword_load() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
  %ld = load volatile <2 x i32>, <2 x i32>* %fptr
  ret void
}

; CHECK-LABEL: flat_scratch_multidword_store:
; CIVI-HSA: flat_store_dword v
; CIVI-HSA: flat_store_dword v
; GFX9:  flat_store_dwordx2
; GFX10: flat_store_dwordx2
; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
define amdgpu_kernel void @flat_scratch_multidword_store() {
  %scratch = alloca <2 x i32>, addrspace(5)
  %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
  store volatile <2 x i32> zeroinitializer, <2 x i32>* %fptr
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i8_max_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i8_max_offset_p1:
; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}store_flat_i8_neg_offset:
; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}

; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
  store volatile i8 %x, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095 glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GFX10: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc dlc{{$}}
define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}

; GFX9: v_add_co_u32_e64 v{{[0-9]+}}, vcc, -2, s
; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
  %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
  %val = load volatile i8, i8* %fptr.offset
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind convergent }