1; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
2; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
3
4; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
5; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
6
7; This test is mostly to test DAG store merging, so disable the vectorizer.
8; Run with devices with different unaligned load restrictions.
9
10; TODO: Vector element tests
11; TODO: Non-zero base offset for load and store combinations
12; TODO: Same base addrspacecasted
13
14
; Two adjacent i8 constant stores through an addrspace(1) pointer; the store
; to %out (the lower address) is issued second and carries align 2.
; The checks expect two separate buffer_store_byte instructions.
; NOTE(review): 456 does not fit in i8 (max 255) - presumably the IR parser
; truncates it; confirm this is intentional.
15; GCN-LABEL: {{^}}merge_global_store_2_constants_i8:
16; GCN: buffer_store_byte
17; GCN: buffer_store_byte
18; GCN: s_endpgm
19define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
20  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
21
22  store i8 123, i8 addrspace(1)* %out.gep.1
23  store i8 456, i8 addrspace(1)* %out, align 2
24  ret void
25}
26
; Same pair of adjacent i8 constant stores as the align-2 variant, but with no
; explicit alignment on either store. The checks expect two separate
; buffer_store_byte instructions.
; NOTE(review): 456 does not fit in i8 (max 255) - presumably the IR parser
; truncates it; confirm this is intentional.
27; GCN-LABEL: {{^}}merge_global_store_2_constants_i8_natural_align:
28; GCN: buffer_store_byte
29; GCN: buffer_store_byte
30; GCN: s_endpgm
31define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
32  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
33
34  store i8 123, i8 addrspace(1)* %out.gep.1
35  store i8 456, i8 addrspace(1)* %out
36  ret void
37}
38
; Two adjacent i16 constant stores (123 at element 1, 456 at element 0); the
; store to %out carries align 4. The checks expect a buffer_store_dword.
39; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
40; GCN: buffer_store_dword v
41define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
42  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
43
44  store i16 123, i16 addrspace(1)* %out.gep.1
45  store i16 456, i16 addrspace(1)* %out, align 4
46  ret void
47}
48
; Variant of the two-i16 constant test where both stored constants are zero;
; the store to %out carries align 4. The checks expect a buffer_store_dword.
49; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
50; GCN: buffer_store_dword v
51define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
52  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
53
54  store i16 0, i16 addrspace(1)* %out.gep.1
55  store i16 0, i16 addrspace(1)* %out, align 4
56  ret void
57}
58
; Two adjacent i16 constant stores with no explicit alignment on either
; store. The checks expect two separate buffer_store_short instructions.
59; GCN-LABEL: {{^}}merge_global_store_2_constants_i16_natural_align:
60; GCN: buffer_store_short
61; GCN: buffer_store_short
62; GCN: s_endpgm
63define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
64  %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
65
66  store i16 123, i16 addrspace(1)* %out.gep.1
67  store i16 456, i16 addrspace(1)* %out
68  ret void
69}
70
; Two adjacent i32 constant stores: 123 (0x7b) to element 1 first, then
; 456 (0x1c8) to element 0. The checks expect one buffer_store_dwordx2 whose
; low register holds 0x1c8 and whose high register holds 0x7b.
71; GCN-LABEL: {{^}}merge_global_store_2_constants_i32:
72; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
73; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
74; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
75define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
76  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
77
78  store i32 123, i32 addrspace(1)* %out.gep.1
79  store i32 456, i32 addrspace(1)* %out
80  ret void
81}
82
; Mixed-type pair: float 1.0 is stored to element 1 through a bitcast of the
; i32 pointer, then i32 456 is stored to element 0. The checks expect a
; buffer_store_dwordx2.
83; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
84; GCN: buffer_store_dwordx2
85define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
86  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
87  %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
88  store float 1.0, float addrspace(1)* %out.gep.1.bc
89  store i32 456, i32 addrspace(1)* %out
90  ret void
91}
92
; Mixed-type pair, opposite arrangement: i32 123 (0x7b) is stored to element 1
; through a bitcast of the float pointer, then float 4.0 is stored to
; element 0. The checks expect one buffer_store_dwordx2 with 4.0 in the low
; register and 0x7b in the high register.
93; GCN-LABEL: {{^}}merge_global_store_2_constants_f32_i32:
94; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
95; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
96; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
97define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
98  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
99  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
100  store i32 123, i32 addrspace(1)* %out.gep.1.bc
101  store float 4.0, float addrspace(1)* %out
102  ret void
103}
104
; Four adjacent i32 constant stores; elements 1..3 are written first and
; element 0 is written last. The checks expect one buffer_store_dwordx4 whose
; first register holds 0x4d2 (1234, element 0) and whose last register holds
; 0x14d (333, element 3); 0x1c8 (456) and 0x7b (123) must also be materialized.
105; GCN-LABEL: {{^}}merge_global_store_4_constants_i32:
106; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x14d{{$}}
107; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x1c8{{$}}
108; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
109; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
110; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
111define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
112  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
113  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
114  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
115
116  store i32 123, i32 addrspace(1)* %out.gep.1
117  store i32 456, i32 addrspace(1)* %out.gep.2
118  store i32 333, i32 addrspace(1)* %out.gep.3
119  store i32 1234, i32 addrspace(1)* %out
120  ret void
121}
122
; Four adjacent float constant stores issued in increasing address order
; (elements 0, 1, 2, 3). The checks expect a buffer_store_dwordx4.
123; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
124; GCN: buffer_store_dwordx4
125define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
126  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
127  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
128  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
129
130  store float 8.0, float addrspace(1)* %out
131  store float 1.0, float addrspace(1)* %out.gep.1
132  store float 2.0, float addrspace(1)* %out.gep.2
133  store float 4.0, float addrspace(1)* %out.gep.3
134  ret void
135}
136
; Four adjacent float constant stores where the store to element 0 (%out) is
; issued last, after elements 1..3. The checks still expect a
; buffer_store_dwordx4.
137; First store is out of order.
138; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
139; GCN: buffer_store_dwordx4
140define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
141  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
142  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
143  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
144
145  store float 1.0, float addrspace(1)* %out.gep.1
146  store float 2.0, float addrspace(1)* %out.gep.2
147  store float 4.0, float addrspace(1)* %out.gep.3
148  store float 8.0, float addrspace(1)* %out
149  ret void
150}
151
; Four adjacent 32-bit constant stores that alternate between i32 (elements 1
; and 3, written through bitcast pointers) and float (elements 2 and 0).
; Expected output differs per RUN configuration: the GCN-NOAA prefix expects a
; single buffer_store_dwordx4, while the GCN-AA prefix (runs passing
; -combiner-alias-analysis) expects a dwordx2 store followed by two dword
; stores.
152; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
153; GCN-NOAA: buffer_store_dwordx4 v
154
155; GCN-AA: buffer_store_dwordx2
156; GCN-AA: buffer_store_dword v
157; GCN-AA: buffer_store_dword v
158
159; GCN: s_endpgm
160define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
161  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
162  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
163  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
164
165  %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
166  %out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
167
168  store i32 11, i32 addrspace(1)* %out.gep.1.bc
169  store float 2.0, float addrspace(1)* %out.gep.2
170  store i32 17, i32 addrspace(1)* %out.gep.3.bc
171  store float 8.0, float addrspace(1)* %out
172  ret void
173}
174
; Three adjacent i32 constant stores (elements 1 and 2 first, element 0 last).
; The checks expect one buffer_store_dwordx2 and one buffer_store_dword in
; either order, and no further buffer_store_dword before s_endpgm.
175; GCN-LABEL: {{^}}merge_global_store_3_constants_i32:
176; SI-DAG: buffer_store_dwordx2
177; SI-DAG: buffer_store_dword
178; SI-NOT: buffer_store_dword
179; GCN: s_endpgm
180define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
181  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
182  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
183
184  store i32 123, i32 addrspace(1)* %out.gep.1
185  store i32 456, i32 addrspace(1)* %out.gep.2
186  store i32 1234, i32 addrspace(1)* %out
187  ret void
188}
189
; Two adjacent i64 constant stores (element 1 first, then element 0).
; The checks expect a buffer_store_dwordx4.
190; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
191; GCN: buffer_store_dwordx4
192define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
193  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
194
195  store i64 123, i64 addrspace(1)* %out.gep.1
196  store i64 456, i64 addrspace(1)* %out
197  ret void
198}
199
; Four adjacent i64 constant stores (elements 1..3 first, element 0 last).
; The checks expect two buffer_store_dwordx4 instructions.
200; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
201; GCN: buffer_store_dwordx4
202; GCN: buffer_store_dwordx4
203define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
204  %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
205  %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
206  %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
207
208  store i64 123, i64 addrspace(1)* %out.gep.1
209  store i64 456, i64 addrspace(1)* %out.gep.2
210  store i64 333, i64 addrspace(1)* %out.gep.3
211  store i64 1234, i64 addrspace(1)* %out
212  ret void
213}
214
; Copies two adjacent i32 values from %in to the same element positions of
; %out. The checks expect one buffer_load_dwordx2 and one
; buffer_store_dwordx2 that stores the very register pair that was loaded.
215; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
216; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
217; GCN: buffer_store_dwordx2 [[LOAD]]
218define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
219  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
220  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
221
222  %lo = load i32, i32 addrspace(1)* %in
223  %hi = load i32, i32 addrspace(1)* %in.gep.1
224
225  store i32 %lo, i32 addrspace(1)* %out
226  store i32 %hi, i32 addrspace(1)* %out.gep.1
227  ret void
228}
229
; Same two-element copy, but both the source and destination start at element
; index 2 instead of 0. The checks expect the dwordx2 load and the dwordx2
; store to each use an immediate offset:8 (2 elements * 4 bytes).
230; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
231; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
232; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
233define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
234  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
235  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
236
237  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
238  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
239  %lo = load i32, i32 addrspace(1)* %in.gep.0
240  %hi = load i32, i32 addrspace(1)* %in.gep.1
241
242  store i32 %lo, i32 addrspace(1)* %out.gep.0
243  store i32 %hi, i32 addrspace(1)* %out.gep.1
244  ret void
245}
246
; Loads two adjacent i32 values and stores them swapped (%hi to element 0,
; %lo to element 1). The checks expect two separate dword loads and two
; separate dword stores.
247; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_shuffle_i32:
248; GCN: buffer_load_dword v
249; GCN: buffer_load_dword v
250; GCN: buffer_store_dword v
251; GCN: buffer_store_dword v
252define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
253  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
254  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
255
256  %lo = load i32, i32 addrspace(1)* %in
257  %hi = load i32, i32 addrspace(1)* %in.gep.1
258
259  store i32 %hi, i32 addrspace(1)* %out
260  store i32 %lo, i32 addrspace(1)* %out.gep.1
261  ret void
262}
263
; Copies four adjacent i32 values from %in to the same element positions of
; %out. The checks expect one buffer_load_dwordx4 and one
; buffer_store_dwordx4 that stores the very register range that was loaded.
264; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
265; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
266; GCN: buffer_store_dwordx4 [[LOAD]]
267define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
268  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
269  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
270  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
271  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
272  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
273  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
274
275  %x = load i32, i32 addrspace(1)* %in
276  %y = load i32, i32 addrspace(1)* %in.gep.1
277  %z = load i32, i32 addrspace(1)* %in.gep.2
278  %w = load i32, i32 addrspace(1)* %in.gep.3
279
280  store i32 %x, i32 addrspace(1)* %out
281  store i32 %y, i32 addrspace(1)* %out.gep.1
282  store i32 %z, i32 addrspace(1)* %out.gep.2
283  store i32 %w, i32 addrspace(1)* %out.gep.3
284  ret void
285}
286
; Copies three adjacent i32 values from %in to %out. The checks expect the
; loads to appear as one dwordx2 plus one dword (either order), then an
; s_waitcnt, then the stores as one dword plus one dwordx2 (either order).
287; GCN-LABEL: {{^}}merge_global_store_3_adjacent_loads_i32:
288; SI-DAG: buffer_load_dwordx2
289; SI-DAG: buffer_load_dword v
290; GCN: s_waitcnt
291; SI-DAG: buffer_store_dword v
292; SI-DAG: buffer_store_dwordx2 v
293; GCN: s_endpgm
294define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
295  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
296  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
297  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
298  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
299
300  %x = load i32, i32 addrspace(1)* %in
301  %y = load i32, i32 addrspace(1)* %in.gep.1
302  %z = load i32, i32 addrspace(1)* %in.gep.2
303
304  store i32 %x, i32 addrspace(1)* %out
305  store i32 %y, i32 addrspace(1)* %out.gep.1
306  store i32 %z, i32 addrspace(1)* %out.gep.2
307  ret void
308}
309
; Float counterpart of the four-element i32 copy: four adjacent floats are
; loaded from %in and stored to the same positions of %out. The checks expect
; one buffer_load_dwordx4 feeding one buffer_store_dwordx4 of the same
; register range.
310; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
311; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
312; GCN: buffer_store_dwordx4 [[LOAD]]
313define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
314  %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
315  %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
316  %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
317  %in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
318  %in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
319  %in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
320
321  %x = load float, float addrspace(1)* %in
322  %y = load float, float addrspace(1)* %in.gep.1
323  %z = load float, float addrspace(1)* %in.gep.2
324  %w = load float, float addrspace(1)* %in.gep.3
325
326  store float %x, float addrspace(1)* %out
327  store float %y, float addrspace(1)* %out.gep.1
328  store float %z, float addrspace(1)* %out.gep.2
329  store float %w, float addrspace(1)* %out.gep.3
330  ret void
331}
332
; Four-element i32 copy with different non-zero starting indices: the source
; starts at element 11 and the destination at element 7. The checks expect
; the dwordx4 load to use offset:44 (11 * 4 bytes) and the dwordx4 store to
; use offset:28 (7 * 4 bytes).
333; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
334; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
335; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
336define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
337  %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
338  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
339  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
340  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
341  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
342  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
343  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
344  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
345
346  %x = load i32, i32 addrspace(1)* %in.gep.0
347  %y = load i32, i32 addrspace(1)* %in.gep.1
348  %z = load i32, i32 addrspace(1)* %in.gep.2
349  %w = load i32, i32 addrspace(1)* %in.gep.3
350
351  store i32 %x, i32 addrspace(1)* %out.gep.0
352  store i32 %y, i32 addrspace(1)* %out.gep.1
353  store i32 %z, i32 addrspace(1)* %out.gep.2
354  store i32 %w, i32 addrspace(1)* %out.gep.3
355  ret void
356}
357
; Four-element i32 copy where the stores are issued in decreasing address
; order (element 3 first, element 0 last) while each value still lands at its
; original element position. A call to llvm.amdgcn.s.barrier sits between the
; loads and the stores. The checks expect a dwordx4 load, then s_barrier,
; then a dwordx4 store of the same register range.
358; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_inverse_i32:
359; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
360; GCN: s_barrier
361; GCN: buffer_store_dwordx4 [[LOAD]]
362define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
363  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
364  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
365  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
366  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
367  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
368  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
369
370  %x = load i32, i32 addrspace(1)* %in
371  %y = load i32, i32 addrspace(1)* %in.gep.1
372  %z = load i32, i32 addrspace(1)* %in.gep.2
373  %w = load i32, i32 addrspace(1)* %in.gep.3
374
375  ; Make sure the barrier doesn't stop this
376  tail call void @llvm.amdgcn.s.barrier() #1
377
378  store i32 %w, i32 addrspace(1)* %out.gep.3
379  store i32 %z, i32 addrspace(1)* %out.gep.2
380  store i32 %y, i32 addrspace(1)* %out.gep.1
381  store i32 %x, i32 addrspace(1)* %out
382
383  ret void
384}
385
; Four adjacent i32 values are loaded and stored in reversed element order
; (%w to element 0 ... %x to element 3), with a call to
; llvm.amdgcn.s.barrier between the loads and the stores. The checks expect
; four separate dword loads, then s_barrier, then four separate dword stores.
386; TODO: Re-packing of loaded register required. Maybe an IR pass
387; should catch this?
388
389; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_shuffle_i32:
390; GCN: buffer_load_dword v
391; GCN: buffer_load_dword v
392; GCN: buffer_load_dword v
393; GCN: buffer_load_dword v
394; GCN: s_barrier
395; GCN: buffer_store_dword v
396; GCN: buffer_store_dword v
397; GCN: buffer_store_dword v
398; GCN: buffer_store_dword v
399define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
400  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
401  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
402  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
403  %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
404  %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
405  %in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
406
407  %x = load i32, i32 addrspace(1)* %in
408  %y = load i32, i32 addrspace(1)* %in.gep.1
409  %z = load i32, i32 addrspace(1)* %in.gep.2
410  %w = load i32, i32 addrspace(1)* %in.gep.3
411
412  ; Make sure the barrier doesn't stop this
413  tail call void @llvm.amdgcn.s.barrier() #1
414
415  store i32 %w, i32 addrspace(1)* %out
416  store i32 %z, i32 addrspace(1)* %out.gep.1
417  store i32 %y, i32 addrspace(1)* %out.gep.2
418  store i32 %x, i32 addrspace(1)* %out.gep.3
419
420  ret void
421}
422
; Copies four adjacent i8 values from %in to %out. The load and the store of
; element 0 both carry align 4; the others have no explicit alignment. The
; checks expect a single buffer_load_dword feeding a single
; buffer_store_dword of the same register.
423; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8:
424; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
425; GCN: buffer_store_dword [[LOAD]]
426; GCN: s_endpgm
427define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
428  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
429  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
430  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
431  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
432  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
433  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
434
435  %x = load i8, i8 addrspace(1)* %in, align 4
436  %y = load i8, i8 addrspace(1)* %in.gep.1
437  %z = load i8, i8 addrspace(1)* %in.gep.2
438  %w = load i8, i8 addrspace(1)* %in.gep.3
439
440  store i8 %x, i8 addrspace(1)* %out, align 4
441  store i8 %y, i8 addrspace(1)* %out.gep.1
442  store i8 %z, i8 addrspace(1)* %out.gep.2
443  store i8 %w, i8 addrspace(1)* %out.gep.3
444  ret void
445}
446
; Same four-byte copy as the align-4 variant, but with no explicit alignment
; on any load or store. The checks expect four buffer_load_ubyte followed by
; four buffer_store_byte instructions.
447; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i8_natural_align:
448; GCN: buffer_load_ubyte
449; GCN: buffer_load_ubyte
450; GCN: buffer_load_ubyte
451; GCN: buffer_load_ubyte
452; GCN: buffer_store_byte
453; GCN: buffer_store_byte
454; GCN: buffer_store_byte
455; GCN: buffer_store_byte
456; GCN: s_endpgm
457define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
458  %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
459  %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
460  %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
461  %in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
462  %in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
463  %in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
464
465  %x = load i8, i8 addrspace(1)* %in
466  %y = load i8, i8 addrspace(1)* %in.gep.1
467  %z = load i8, i8 addrspace(1)* %in.gep.2
468  %w = load i8, i8 addrspace(1)* %in.gep.3
469
470  store i8 %x, i8 addrspace(1)* %out
471  store i8 %y, i8 addrspace(1)* %out.gep.1
472  store i8 %z, i8 addrspace(1)* %out.gep.2
473  store i8 %w, i8 addrspace(1)* %out.gep.3
474  ret void
475}
476
; Loads one <4 x i32> vector, extracts its four elements, and stores them as
; scalars to four adjacent i32 slots of %out in element order. Both
; configurations expect a dwordx4 load. The GCN-NOAA prefix expects four
; separate dword stores; the GCN-AA prefix (runs passing
; -combiner-alias-analysis) expects one dwordx4 store of the loaded registers.
477; This works once AA is enabled on the subtarget
478; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
479; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
480
481; GCN-NOAA: buffer_store_dword v
482; GCN-NOAA: buffer_store_dword v
483; GCN-NOAA: buffer_store_dword v
484; GCN-NOAA: buffer_store_dword v
485
486; GCN-AA: buffer_store_dwordx4 [[LOAD]]
487
488; GCN: s_endpgm
489define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
490  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
491  %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
492  %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
493  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
494
495  %x = extractelement <4 x i32> %vec, i32 0
496  %y = extractelement <4 x i32> %vec, i32 1
497  %z = extractelement <4 x i32> %vec, i32 2
498  %w = extractelement <4 x i32> %vec, i32 3
499
500  store i32 %x, i32 addrspace(1)* %out
501  store i32 %y, i32 addrspace(1)* %out.gep.1
502  store i32 %z, i32 addrspace(1)* %out.gep.2
503  store i32 %w, i32 addrspace(1)* %out.gep.3
504  ret void
505}
506
; addrspace(3) counterpart of the two-i8 constant test: two adjacent i8
; constant stores, with align 2 on the store to %out. The checks expect two
; separate ds_write_b8 instructions.
; NOTE(review): 456 does not fit in i8 (max 255) - presumably the IR parser
; truncates it; confirm this is intentional.
507; GCN-LABEL: {{^}}merge_local_store_2_constants_i8:
508; GCN: ds_write_b8
509; GCN: ds_write_b8
510; GCN: s_endpgm
511define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
512  %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
513
514  store i8 123, i8 addrspace(3)* %out.gep.1
515  store i8 456, i8 addrspace(3)* %out, align 2
516  ret void
517}
518
; Two adjacent i32 constant stores through an addrspace(3) pointer: 123 (0x7b)
; to element 1, then 456 (0x1c8) to element 0. The checks expect a single
; ds_write2_b32 with 0x1c8 as the first data operand, 0x7b as the second, and
; offset1:1.
519; GCN-LABEL: {{^}}merge_local_store_2_constants_i32:
520; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
521; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
522; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
523define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
524  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
525
526  store i32 123, i32 addrspace(3)* %out.gep.1
527  store i32 456, i32 addrspace(3)* %out
528  ret void
529}
530
; Four adjacent i32 constant stores through an addrspace(3) pointer (elements
; 1..3 first, element 0 last). The checks expect two ds_write2_b32
; instructions in either order: one writing 0x1c8 (456) and 0x14d (333) at
; offset0:2 offset1:3, and one writing 0x4d2 (1234) and 0x7b (123) at
; offset1:1.
531; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
532; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
533; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
534; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3
535
536; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
537; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
538; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
539
540; GCN: s_endpgm
541define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
542  %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
543  %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
544  %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
545
546  store i32 123, i32 addrspace(3)* %out.gep.1
547  store i32 456, i32 addrspace(3)* %out.gep.2
548  store i32 333, i32 addrspace(3)* %out.gep.3
549  store i32 1234, i32 addrspace(3)* %out
550  ret void
551}
552
; Five adjacent i32 constant stores (9, 12, 16, -12, 11), each with align 4,
; issued in increasing address order. The checks expect a
; buffer_store_dwordx4 whose first register holds 9 and whose last register
; holds -12, followed by a v_mov of 11 and a single buffer_store_dword of
; that register. Unlike the earlier tests, this function has no #0 attribute
; group.
553; GCN-LABEL: {{^}}merge_global_store_5_constants_i32:
554; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 9{{$}}
555; GCN-DAG: v_mov_b32_e32 v[[HI4:[0-9]+]], -12{{$}}
556; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
557; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
558; GCN: buffer_store_dword v[[HI]]
559define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
560  store i32 9, i32 addrspace(1)* %out, align 4
561  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
562  store i32 12, i32 addrspace(1)* %idx1, align 4
563  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
564  store i32 16, i32 addrspace(1)* %idx2, align 4
565  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
566  store i32 -12, i32 addrspace(1)* %idx3, align 4
567  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
568  store i32 11, i32 addrspace(1)* %idx4, align 4
569  ret void
570}
571
; Six adjacent i32 constant stores, each with align 4, issued in increasing
; address order. The checks expect a buffer_store_dwordx4 followed by a
; buffer_store_dwordx2.
572; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
573; GCN: buffer_store_dwordx4
574; GCN: buffer_store_dwordx2
575define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
576  store i32 13, i32 addrspace(1)* %out, align 4
577  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
578  store i32 15, i32 addrspace(1)* %idx1, align 4
579  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
580  store i32 62, i32 addrspace(1)* %idx2, align 4
581  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
582  store i32 63, i32 addrspace(1)* %idx3, align 4
583  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
584  store i32 11, i32 addrspace(1)* %idx4, align 4
585  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
586  store i32 123, i32 addrspace(1)* %idx5, align 4
587  ret void
588}
589
; Seven adjacent i32 constant stores, each with align 4, issued in increasing
; address order. The checks expect a buffer_store_dwordx4, then a
; buffer_store_dwordx2, then a buffer_store_dword.
590; GCN-LABEL: {{^}}merge_global_store_7_constants_i32:
591; GCN: buffer_store_dwordx4
592; GCN: buffer_store_dwordx2
593; GCN: buffer_store_dword v
594define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
595  store i32 34, i32 addrspace(1)* %out, align 4
596  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
597  store i32 999, i32 addrspace(1)* %idx1, align 4
598  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
599  store i32 65, i32 addrspace(1)* %idx2, align 4
600  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
601  store i32 33, i32 addrspace(1)* %idx3, align 4
602  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
603  store i32 98, i32 addrspace(1)* %idx4, align 4
604  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
605  store i32 91, i32 addrspace(1)* %idx5, align 4
606  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
607  store i32 212, i32 addrspace(1)* %idx6, align 4
608  ret void
609}
610
; Eight adjacent i32 constant stores, each with align 4, issued in increasing
; address order. The checks expect two buffer_store_dwordx4 instructions
; before s_endpgm.
611; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
612; GCN: buffer_store_dwordx4
613; GCN: buffer_store_dwordx4
614; GCN: s_endpgm
615define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
616  store i32 34, i32 addrspace(1)* %out, align 4
617  %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
618  store i32 999, i32 addrspace(1)* %idx1, align 4
619  %idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
620  store i32 65, i32 addrspace(1)* %idx2, align 4
621  %idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
622  store i32 33, i32 addrspace(1)* %idx3, align 4
623  %idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
624  store i32 98, i32 addrspace(1)* %idx4, align 4
625  %idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
626  store i32 91, i32 addrspace(1)* %idx5, align 4
627  %idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
628  store i32 212, i32 addrspace(1)* %idx6, align 4
629  %idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
630  store i32 999, i32 addrspace(1)* %idx7, align 4
631  ret void
632}
633
; Copies a <3 x i32> vector from %in (loaded with align 4) to %out; both
; pointers are noalias. The checks expect the copy to be split into a dwordx2
; access at offset 0 and a dword access at offset:8 on both the load and the
; store side, with an s_waitcnt vmcnt in between. They also require that
; SCRATCH_RSRC_DWORD and offen never appear, and that ScratchSize is 0.
634; This requires handling of scalar_to_vector for v2i64 to avoid
635; scratch usage.
636; FIXME: Should do single load and store
637
638; GCN-LABEL: {{^}}copy_v3i32_align4:
639; GCN-NOT: SCRATCH_RSRC_DWORD
640; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
641; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
642; GCN-NOT: offen
643; GCN: s_waitcnt vmcnt
644; GCN-NOT: offen
645; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
646; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
647
648; GCN: ScratchSize: 0{{$}}
649define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
650  %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
651  store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
652  ret void
653}
654
; Copies a <3 x i64> vector from %in (loaded with align 4) to %out; both
; pointers are noalias. The checks expect a dwordx4 access at offset 0 and a
; dwordx2 access at offset:16 on both the load and the store side, with an
; s_waitcnt vmcnt in between. They also require that SCRATCH_RSRC_DWORD and
; offen never appear, and that ScratchSize is 0.
655; GCN-LABEL: {{^}}copy_v3i64_align4:
656; GCN-NOT: SCRATCH_RSRC_DWORD
657; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
658; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
659; GCN-NOT: offen
660; GCN: s_waitcnt vmcnt
661; GCN-NOT: offen
662; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
663; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
664; GCN: ScratchSize: 0{{$}}
665define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
666  %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
667  store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
668  ret void
669}
670
; Loads a <3 x float> vector from %in (align 4), adds the constant vector
; <1.0, 2.0, 4.0>, and stores the result to %out; both pointers are noalias.
; The checks expect a dwordx2 access at offset 0 and a dword access at
; offset:8 on both the load and the store side, with an s_waitcnt vmcnt in
; between. They also require that SCRATCH_RSRC_DWORD and offen never appear,
; and that ScratchSize is 0.
671; GCN-LABEL: {{^}}copy_v3f32_align4:
672; GCN-NOT: SCRATCH_RSRC_DWORD
673; GCN-DAG: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
674; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
675; GCN-NOT: offen
676; GCN: s_waitcnt vmcnt
677; GCN-NOT: offen
678; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
679; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
680; GCN: ScratchSize: 0{{$}}
681define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
682  %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
683  %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
684  store <3 x float> %fadd, <3 x float> addrspace(1)* %out
685  ret void
686}
687
; Loads a <3 x double> vector from %in (align 4), adds the constant vector
; <1.0, 2.0, 4.0>, and stores the result to %out; both pointers are noalias.
; The checks expect a dwordx4 access at offset 0 and a dwordx2 access at
; offset:16 on both the load and the store side, with an s_waitcnt vmcnt in
; between. They also require that SCRATCH_RSRC_DWORD and offen never appear,
; and that ScratchSize is 0.
688; GCN-LABEL: {{^}}copy_v3f64_align4:
689; GCN-NOT: SCRATCH_RSRC_DWORD
690; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
691; GCN-DAG: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
692; GCN-NOT: offen
693; GCN: s_waitcnt vmcnt
694; GCN-NOT: offen
695; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
696; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
697; GCN: ScratchSize: 0{{$}}
698define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
699  %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
700  %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
701  store <3 x double> %fadd, <3 x double> addrspace(1)* %out
702  ret void
703}
704
; Barrier intrinsic called by the *_inverse_i32 and 4-element *_shuffle_i32
; tests between their loads and stores.
705declare void @llvm.amdgcn.s.barrier() #1
706
; Attribute groups: #0 is used on most test functions in this file; #1 is used
; on the barrier declaration and its call sites.
707attributes #0 = { nounwind }
708attributes #1 = { convergent nounwind }
709