; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,MUBUF,ALIGNED %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,MUBUF,UNALIGNED %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,MUBUF,ALIGNED %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FLATSCR,ALIGNED %s
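
; Test lowering of loads and stores with less-than-natural alignment in the
; local (LDS), global, constant, and private address spaces. The ALIGNED and
; UNALIGNED prefix groups differ only in whether unaligned-access-mode is
; enabled for the target.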

; SI-LABEL: {{^}}local_unaligned_load_store_i16:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
  %v = load i16, i16 addrspace(3)* %p, align 1
  store i16 %v, i16 addrspace(3)* %r, align 1
  ret void
}
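
; Note that in all of the run lines above, unaligned LDS accesses are split
; into naturally aligned ds operations; in this test, unaligned-access-mode
; only relaxes the global buffer accesses checked below.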

; SI-LABEL: {{^}}global_unaligned_load_store_i16:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_ushort
; UNALIGNED: buffer_store_short
; SI: s_endpgm
define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
  %v = load i16, i16 addrspace(1)* %p, align 1
  store i16 %v, i16 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_i32:

; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI-NOT: v_or
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
  %v = load i32, i32 addrspace(3)* %p, align 1
  store i32 %v, i32 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(1)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_align2_load_store_i32:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short

; UNALIGNED: buffer_load_dword
; UNALIGNED: buffer_store_dword
define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(1)* %p, align 2
  store i32 %v, i32 addrspace(1)* %r, align 2
  ret void
}

; GCN-LABEL: {{^}}local_align2_load_store_i32:
; GCN: ds_read_u16
; GCN: ds_read_u16
; GCN: ds_write_b16
; GCN: ds_write_b16
define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
  %v = load i32, i32 addrspace(3)* %p, align 2
  store i32 %v, i32 addrspace(3)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_i64:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
  %v = load i64, i64 addrspace(3)* %p, align 1
  store i64 %v, i64 addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}local_unaligned_load_store_v2i32:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl

; SI: ds_write_b8
; SI-NOT: v_or_b32
; SI-NOT: v_lshl
; SI: ds_write_b8
; SI: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
  %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
  store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_align2_load_store_i64:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_load_ushort

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short
; ALIGNED: buffer_store_short

; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(1)* %p, align 2
  store i64 %v, i64 addrspace(1)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}unaligned_load_store_i64_global:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED-NOT: v_or_
; ALIGNED-NOT: v_lshl

; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dwordx2
; UNALIGNED: buffer_store_dwordx2
define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(1)* %p, align 1
  store i64 %v, i64 addrspace(1)* %r, align 1
  ret void
}

; GCN-LABEL: {{^}}local_unaligned_load_store_v4i32:
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8

; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: ds_write_b8
; GCN: s_endpgm
define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
  ret void
}

; SI-LABEL: {{^}}global_unaligned_load_store_v4i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte
; ALIGNED: buffer_store_byte

; UNALIGNED: buffer_load_dwordx4
; UNALIGNED: buffer_store_dwordx4
define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
  ret void
}

; GCN-LABEL: {{^}}local_load_i64_align_4:
; GCN: ds_read2_b32
define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_load_i64_align_4_with_offset:
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
  %val = load i64, i64 addrspace(3)* %ptr, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_load_i64_align_4_with_split_offset:
; Test the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
; GCN: s_endpgm
define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  %val = load i64, i64 addrspace(3)* %ptri64, align 4
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}
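
; The two offsets of ds_read2_b32/ds_write2_b32 are 8-bit fields scaled by the
; 4-byte element size, so dword elements 255 and 256 (byte offsets 1020 and
; 1024) cannot both be encoded directly; the base address is bumped instead
; and a small offset pair is used.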

; GCN-LABEL: {{^}}local_load_i64_align_1:
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: ds_read_u8
; GCN: store_dwordx2
define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
  %val = load i64, i64 addrspace(3)* %in, align 1
  store i64 %val, i64 addrspace(1)* %out, align 8
  ret void
}

; GCN-LABEL: {{^}}local_store_i64_align_4:
; GCN: ds_write2_b32
define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
  store i64 %val, i64 addrspace(3)* %out, align 4
  ret void
}

; GCN-LABEL: {{^}}local_store_i64_align_4_with_offset:
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
; GCN: s_endpgm
define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
  %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
  store i64 0, i64 addrspace(3)* %ptr, align 4
  ret void
}

; GCN-LABEL: {{^}}local_store_i64_align_4_with_split_offset:
; Test the case where the lo offset fits in 8 bits, but the hi offset needs 9 bits.
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
; GCN: s_endpgm
define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
  %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
  %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
  %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
  store i64 0, i64 addrspace(3)* %ptri64, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: s_load_dword

; SI: buffer_store_dword
define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(4)* %p, align 1
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i32:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; UNALIGNED: s_load_dword
; UNALIGNED: buffer_store_dword
define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
  %v = load i32, i32 addrspace(4)* %p, align 2
  store i32 %v, i32 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i64:
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort
; ALIGNED: buffer_load_ushort

; UNALIGNED: s_load_dwordx4
; UNALIGNED: buffer_store_dwordx2
define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(4)* %p, align 2
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
}
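
; For uniform loads from the constant address space, the UNALIGNED
; configurations can still select scalar (s_load) instructions, while the
; ALIGNED configurations split the access into byte or short buffer loads.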

; SI-LABEL: {{^}}constant_align4_load_i64:
; SI: s_load_dwordx2
; SI: buffer_store_dwordx2
define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(4)* %p, i64 addrspace(1)* %r) #0 {
  %v = load i64, i64 addrspace(4)* %p, align 4
  store i64 %v, i64 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_v4i32:
; SI: s_load_dwordx4
; SI: buffer_store_dwordx4
define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 4
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_v2i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: buffer_load_dwordx2

; SI: buffer_store_dwordx2
define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(4)* %p, <2 x i32> addrspace(1)* %r) #0 {
  %v = load <2 x i32>, <2 x i32> addrspace(4)* %p, align 1
  store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_unaligned_load_v4i32:
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte
; ALIGNED: buffer_load_ubyte

; UNALIGNED: buffer_load_dwordx4

; SI: buffer_store_dwordx4
define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(4)* %p, <4 x i32> addrspace(1)* %r) #0 {
  %v = load <4 x i32>, <4 x i32> addrspace(4)* %p, align 1
  store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align4_load_i8:
; SI: s_load_dword
; SI: buffer_store_byte
define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
  %v = load i8, i8 addrspace(4)* %p, align 4
  store i8 %v, i8 addrspace(1)* %r, align 4
  ret void
}

; SI-LABEL: {{^}}constant_align2_load_i8:
; SI: buffer_load_ubyte
; SI: buffer_store_byte
define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(4)* %p, i8 addrspace(1)* %r) #0 {
  %v = load i8, i8 addrspace(4)* %p, align 2
  store i8 %v, i8 addrspace(1)* %r, align 2
  ret void
}

; SI-LABEL: {{^}}constant_align4_merge_load_2_i32:
; SI: s_load_dwordx2 s[[[LO:[0-9]+]]:[[HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
; SI: buffer_store_dwordx2 v[[[VLO]]:[[VHI]]]
define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(4)* %p, i32 addrspace(1)* %r) #0 {
  %gep0 = getelementptr i32, i32 addrspace(4)* %p, i64 1
  %v0 = load i32, i32 addrspace(4)* %p, align 4
  %v1 = load i32, i32 addrspace(4)* %gep0, align 4

  %gep1 = getelementptr i32, i32 addrspace(1)* %r, i64 1
  store i32 %v0, i32 addrspace(1)* %r, align 4
  store i32 %v1, i32 addrspace(1)* %gep1, align 4
  ret void
}

; SI-LABEL: {{^}}local_load_align1_v16i8:
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8
; SI: ds_read_u8

; SI: ScratchSize: 0{{$}}
define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
  store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
  ret void
}

; SI-LABEL: {{^}}local_store_align1_v16i8:
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8
; SI: ds_write_b8

; SI: ScratchSize: 0{{$}}
define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
  store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
  ret void
}

; SI-LABEL: {{^}}private_load_align1_f64:
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; MUBUF: buffer_load_ubyte
; FLATSCR: scratch_load_dwordx2
define double @private_load_align1_f64(double addrspace(5)* %in) {
  %x = load double, double addrspace(5)* %in, align 1
  ret double %x
}

; SI-LABEL: {{^}}private_store_align1_f64:
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; MUBUF: buffer_store_byte
; FLATSCR: scratch_store_dwordx2
define void @private_store_align1_f64(double addrspace(5)* %out, double %x) #0 {
  store double %x, double addrspace(5)* %out, align 1
  ret void
}

; SI-LABEL: {{^}}private_load_align4_f64:
; MUBUF: buffer_load_dword
; MUBUF: buffer_load_dword
; FLATSCR: scratch_load_dwordx2
define double @private_load_align4_f64(double addrspace(5)* %in) {
  %x = load double, double addrspace(5)* %in, align 4
  ret double %x
}

; SI-LABEL: {{^}}private_store_align4_f64:
; MUBUF: buffer_store_dword
; MUBUF: buffer_store_dword
; FLATSCR: scratch_store_dwordx2
define void @private_store_align4_f64(double addrspace(5)* %out, double %x) #0 {
  store double %x, double addrspace(5)* %out, align 4
  ret void
}

; SI-LABEL: {{^}}private_load_align2_f64:
; MUBUF: buffer_load_ushort
; MUBUF: buffer_load_ushort
; MUBUF: buffer_load_ushort
; MUBUF: buffer_load_ushort
; FLATSCR: scratch_load_dwordx2
define double @private_load_align2_f64(double addrspace(5)* %in) {
  %x = load double, double addrspace(5)* %in, align 2
  ret double %x
}

; SI-LABEL: {{^}}private_store_align2_f64:
; MUBUF: buffer_store_short
; MUBUF: buffer_store_short
; MUBUF: buffer_store_short
; MUBUF: buffer_store_short
; FLATSCR: scratch_store_dwordx2
define void @private_store_align2_f64(double addrspace(5)* %out, double %x) #0 {
  store double %x, double addrspace(5)* %out, align 2
  ret void
}
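
; With flat scratch enabled, under-aligned private accesses can be selected
; as full-width scratch_load/scratch_store instructions, whereas the MUBUF
; path splits them according to the declared alignment.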

; Should not merge these two align-2 i16 stores into a single dword store.
define amdgpu_kernel void @global_store_2xi16_align2(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
  %gep.r = getelementptr i16, i16 addrspace(1)* %r, i64 1
  %v = load i16, i16 addrspace(1)* %p, align 2
  store i16 1, i16 addrspace(1)* %r, align 2
  store i16 2, i16 addrspace(1)* %gep.r, align 2
  ret void
}

; Should not merge these two align-2 i16 loads into a single dword load.
define i32 @load_2xi16_align2(i16 addrspace(1)* %p) #0 {
  %gep.p = getelementptr i16, i16 addrspace(1)* %p, i64 1
  %p.0 = load i16, i16 addrspace(1)* %p, align 2
  %p.1 = load i16, i16 addrspace(1)* %gep.p, align 2
  %zext.0 = zext i16 %p.0 to i32
  %zext.1 = zext i16 %p.1 to i32
  %shl.1 = shl i32 %zext.1, 16
  %or = or i32 %zext.0, %shl.1
  ret i32 %or
}
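
; In both cases above the merged access would only be 2-byte aligned, so
; combining the pair into a single dword load or store would under-align it.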

attributes #0 = { nounwind }