; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=GCN-AA %s

; This test is mostly to test DAG store merging, so disable the vectorizer.
; Run with devices with different unaligned load restrictions.

; TODO: Vector element tests
; TODO: Non-zero base offset for load and store combinations
; TODO: Same base addrspacecasted


; Two adjacent i8 constant stores with a 2-byte-aligned base: expect a single
; merged 16-bit store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out, align 2
  ret void
}
22
; Same as above but with only natural (1-byte) alignment: the stores must not
; be merged, so two byte stores are emitted.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1

  store i8 123, i8 addrspace(1)* %out.gep.1
  store i8 456, i8 addrspace(1)* %out
  ret void
}
34
; Two adjacent i16 constant stores with a dword-aligned base: expect one
; merged 32-bit store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out, align 4
  ret void
}
44
; Same merge with both stored values zero.
; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
; GCN: buffer_store_dword v
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 0, i16 addrspace(1)* %out.gep.1
  store i16 0, i16 addrspace(1)* %out, align 4
  ret void
}
54
; i16 stores with only natural (2-byte) alignment: no merge, two short stores.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
; GCN: buffer_store_short
; GCN: buffer_store_short
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1

  store i16 123, i16 addrspace(1)* %out.gep.1
  store i16 456, i16 addrspace(1)* %out
  ret void
}
66
; Two adjacent i32 constant stores merge into one 64-bit store; the low dword
; holds 456 (0x1c8, stored at %out) and the high dword 123 (0x7b).
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out
  ret void
}
78
; Mixed-type (i32 + float) adjacent stores still merge into one 64-bit store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
  store float 1.0, float addrspace(1)* %out.gep.1.bc
  store i32 456, i32 addrspace(1)* %out
  ret void
}
88
; Mixed-type merge with float first: 4.0 in the low dword, 123 (0x7b) high.
; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
; GCN: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  store i32 123, i32 addrspace(1)* %out.gep.1.bc
  store float 4.0, float addrspace(1)* %out
  ret void
}
100
; Four adjacent i32 constant stores merge into a single 128-bit store.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI]]]
define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 333, i32 addrspace(1)* %out.gep.3
  store i32 1234, i32 addrspace(1)* %out
  ret void
}
118
; Four adjacent float constant stores in ascending address order merge into
; one 128-bit store.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 8.0, float addrspace(1)* %out
  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  ret void
}
132
; First store is out of order.
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  store float 1.0, float addrspace(1)* %out.gep.1
  store float 2.0, float addrspace(1)* %out.gep.2
  store float 4.0, float addrspace(1)* %out.gep.3
  store float 8.0, float addrspace(1)* %out
  ret void
}
147
; Interleaved i32/float stores to adjacent slots still merge into one
; 128-bit store (requires AA to see the bitcasts don't alias differently).
; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
; GCN-AA: buffer_store_dwordx4 v
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3

  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*

  store i32 11, i32 addrspace(1)* %out.gep.1.bc
  store float 2.0, float addrspace(1)* %out.gep.2
  store i32 17, i32 addrspace(1)* %out.gep.3.bc
  store float 8.0, float addrspace(1)* %out
  ret void
}
165
; Three adjacent i32 constant stores: SI (no dwordx3) splits into x2 + x1,
; CI uses a single dwordx3 store.
; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
; GCN-NOT: buffer_store_dword
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2

  store i32 123, i32 addrspace(1)* %out.gep.1
  store i32 456, i32 addrspace(1)* %out.gep.2
  store i32 1234, i32 addrspace(1)* %out
  ret void
}
181
; Two adjacent i64 constant stores merge into one 128-bit store.
; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out
  ret void
}
191
; Four adjacent i64 constant stores merge into two 128-bit stores.
; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3

  store i64 123, i64 addrspace(1)* %out.gep.1
  store i64 456, i64 addrspace(1)* %out.gep.2
  store i64 333, i64 addrspace(1)* %out.gep.3
  store i64 1234, i64 addrspace(1)* %out
  ret void
}
206
; Two adjacent loads feeding two adjacent stores merge into a 64-bit
; load/store pair reusing the same register pair.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx2 [[LOAD]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}
221
; Same merge when both load and store bases start at a non-zero offset (8).
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %lo = load i32, i32 addrspace(1)* %in.gep.0
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %lo, i32 addrspace(1)* %out.gep.0
  store i32 %hi, i32 addrspace(1)* %out.gep.1
  ret void
}
238
; Loaded values are stored in swapped order; the accesses still merge.
; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx2 v
; GCN: buffer_store_dwordx2 v
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1

  %lo = load i32, i32 addrspace(1)* %in
  %hi = load i32, i32 addrspace(1)* %in.gep.1

  store i32 %hi, i32 addrspace(1)* %out
  store i32 %lo, i32 addrspace(1)* %out.gep.1
  ret void
}
253
; Four adjacent loads feeding four adjacent stores merge into a 128-bit
; load/store pair.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}
276
; Three-element copy: SI splits into x2 + x1, CI uses dwordx3.
; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
; SI-DAG: buffer_load_dwordx2
; SI-DAG: buffer_load_dword
; CI-DAG: buffer_load_dwordx3
; GCN: s_waitcnt
; SI-DAG: buffer_store_dwordx2
; SI-DAG: buffer_store_dword v
; CI-DAG: buffer_store_dwordx3
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  ret void
}
301
; Float variant of the 4-element load/store merge.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3

  %x = load float, float addrspace(1)* %in
  %y = load float, float addrspace(1)* %in.gep.1
  %z = load float, float addrspace(1)* %in.gep.2
  %w = load float, float addrspace(1)* %in.gep.3

  store float %x, float addrspace(1)* %out
  store float %y, float addrspace(1)* %out.gep.1
  store float %z, float addrspace(1)* %out.gep.2
  store float %w, float addrspace(1)* %out.gep.3
  ret void
}
324
; 4-element merge with different non-zero bases for load (44) and store (28).
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10

  %x = load i32, i32 addrspace(1)* %in.gep.0
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  store i32 %x, i32 addrspace(1)* %out.gep.0
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}
349
; Stores issued in descending address order, separated from the loads by a
; barrier; the merge must still happen across the barrier.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: s_barrier
; GCN: buffer_store_dwordx4 [[LOAD]]
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out.gep.3
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %x, i32 addrspace(1)* %out

  ret void
}
377
; TODO: Re-packing of loaded register required. Maybe an IR pass
; should catch this?

; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
; GCN: buffer_load_dwordx4 v
; GCN: s_barrier
; GCN: buffer_store_dwordx4 v
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3

  %x = load i32, i32 addrspace(1)* %in
  %y = load i32, i32 addrspace(1)* %in.gep.1
  %z = load i32, i32 addrspace(1)* %in.gep.2
  %w = load i32, i32 addrspace(1)* %in.gep.3

  ; Make sure the barrier doesn't stop this
  tail call void @llvm.amdgcn.s.barrier() #1

  store i32 %w, i32 addrspace(1)* %out
  store i32 %z, i32 addrspace(1)* %out.gep.1
  store i32 %y, i32 addrspace(1)* %out.gep.2
  store i32 %x, i32 addrspace(1)* %out.gep.3

  ret void
}
408
; Four adjacent i8 accesses with a dword-aligned base merge into a single
; 32-bit load/store pair.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
; GCN: buffer_store_dword [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in, align 4
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out, align 4
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}
432
; Same i8 accesses with only natural (1-byte) alignment: no merging; four
; byte loads and four byte stores are emitted.
; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_load_ubyte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3

  %x = load i8, i8 addrspace(1)* %in
  %y = load i8, i8 addrspace(1)* %in.gep.1
  %z = load i8, i8 addrspace(1)* %in.gep.2
  %w = load i8, i8 addrspace(1)* %in.gep.3

  store i8 %x, i8 addrspace(1)* %out
  store i8 %y, i8 addrspace(1)* %out.gep.1
  store i8 %z, i8 addrspace(1)* %out.gep.2
  store i8 %w, i8 addrspace(1)* %out.gep.3
  ret void
}
462
; Scalar stores of all four extracted vector elements merge back into a
; single 128-bit store of the loaded vector.
; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
; GCN: buffer_store_dwordx4 [[LOAD]]
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in

  %x = extractelement <4 x i32> %vec, i32 0
  %y = extractelement <4 x i32> %vec, i32 1
  %z = extractelement <4 x i32> %vec, i32 2
  %w = extractelement <4 x i32> %vec, i32 3

  store i32 %x, i32 addrspace(1)* %out
  store i32 %y, i32 addrspace(1)* %out.gep.1
  store i32 %z, i32 addrspace(1)* %out.gep.2
  store i32 %w, i32 addrspace(1)* %out.gep.3
  ret void
}
484
; LDS (addrspace 3) variant: two adjacent i8 stores merge into a 16-bit
; ds_write.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
; GCN: ds_write_b16
; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1

  store i8 123, i8 addrspace(3)* %out.gep.1
  store i8 456, i8 addrspace(3)* %out, align 2
  ret void
}
495
; LDS variant: two adjacent i32 stores become a single ds_write2_b32.
; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out
  ret void
}
507
; LDS variant: four adjacent i32 stores become two ds_write2_b32 pairs.
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3

; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1

; GCN: s_endpgm
define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3

  store i32 123, i32 addrspace(3)* %out.gep.1
  store i32 456, i32 addrspace(3)* %out.gep.2
  store i32 333, i32 addrspace(3)* %out.gep.3
  store i32 1234, i32 addrspace(3)* %out
  ret void
}
529
; Five adjacent i32 stores: merged as a 128-bit store plus one leftover
; dword store.
; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
; GCN: buffer_store_dwordx4 v[[[LO]]:[[HI4]]]
; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
; GCN: buffer_store_dword v[[HI]]
define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
  store i32 9, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 12, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 16, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 -12, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  ret void
}
548
; Six adjacent i32 stores: merged as a 128-bit plus a 64-bit store.
; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
  store i32 13, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 15, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 62, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 63, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 11, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 123, i32 addrspace(1)* %idx5, align 4
  ret void
}
566
; Seven adjacent i32 stores: x4 plus SI x2 (+x1) or CI x3 for the tail.
; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
; GCN: buffer_store_dwordx4
; SI-DAG: buffer_store_dwordx2
; CI: buffer_store_dwordx3
define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  ret void
}
587
; Eight adjacent i32 stores: merged into two 128-bit stores.
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
  store i32 34, i32 addrspace(1)* %out, align 4
  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
  store i32 999, i32 addrspace(1)* %idx1, align 4
  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
  store i32 65, i32 addrspace(1)* %idx2, align 4
  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
  store i32 33, i32 addrspace(1)* %idx3, align 4
  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
  store i32 98, i32 addrspace(1)* %idx4, align 4
  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
  store i32 91, i32 addrspace(1)* %idx5, align 4
  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
  store i32 212, i32 addrspace(1)* %idx6, align 4
  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
  store i32 999, i32 addrspace(1)* %idx7, align 4
  ret void
}
610
; This requires handling of scalar_to_vector for v2i64 to avoid
; scratch usage.
; FIXME: Should do single load and store

; GCN-LABEL: {{^}}copy_v3i32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}

; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
  ret void
}
633
; v3i64 copy with 4-byte-aligned load: split as x4 + x2, no scratch spill.
; GCN-LABEL: {{^}}copy_v3i64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
  ret void
}
649
; v3f32 load/add/store: SI splits x2 + x1, CI uses x3; no scratch spill.
; GCN-LABEL: {{^}}copy_v3f32_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; SI-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_load_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; SI-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; SI-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; CI-DAG: buffer_store_dwordx3 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
  ret void
}
668
; v3f64 load/add/store: split as x4 + x2; no scratch spill.
; GCN-LABEL: {{^}}copy_v3f64_align4:
; GCN-NOT: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN-NOT: offen
; GCN: s_waitcnt vmcnt
; GCN-NOT: offen
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
  ret void
}
685
; Intrinsic declaration used by the barrier tests above, plus the attribute
; groups referenced throughout this file.
declare void @llvm.amdgcn.s.barrier() #1

attributes #0 = { nounwind }
attributes #1 = { convergent nounwind }
690