1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
3; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
4; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
5; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
6
; Copies one i8 between two addrspace(3) pointers with align 1.
; All four RUN configurations share the same expectation: a single
; ds_read_u8 followed by a single ds_write_b8.
7define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
8; GCN-LABEL: ds1align1:
9; GCN:       ; %bb.0:
10; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
11; GCN-NEXT:    s_waitcnt lgkmcnt(0)
12; GCN-NEXT:    v_mov_b32_e32 v0, s0
13; GCN-NEXT:    ds_read_u8 v0, v0
14; GCN-NEXT:    v_mov_b32_e32 v1, s1
15; GCN-NEXT:    s_waitcnt lgkmcnt(0)
16; GCN-NEXT:    ds_write_b8 v1, v0
17; GCN-NEXT:    s_endpgm
18  %val = load i8, i8 addrspace(3)* %in, align 1
19  store i8 %val, i8 addrspace(3)* %out, align 1
20  ret void
21}
22
; Copies one i16 between two addrspace(3) pointers with align 1.
; With -mattr=-unaligned-access-mode the expected code splits the access into
; two ds_read_u8 and two ds_write_b8; with -mattr=+unaligned-access-mode a
; single ds_read_u16 / ds_write_b16 pair is expected.
23define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
24; ALIGNED-LABEL: ds2align1:
25; ALIGNED:       ; %bb.0:
26; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
27; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
28; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
29; ALIGNED-NEXT:    ds_read_u8 v1, v0
30; ALIGNED-NEXT:    ds_read_u8 v0, v0 offset:1
31; ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
32; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
33; ALIGNED-NEXT:    ds_write_b8 v2, v1
34; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
35; ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:1
36; ALIGNED-NEXT:    s_endpgm
37;
38; UNALIGNED-LABEL: ds2align1:
39; UNALIGNED:       ; %bb.0:
40; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
42; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
43; UNALIGNED-NEXT:    ds_read_u16 v0, v0
44; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
45; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
46; UNALIGNED-NEXT:    ds_write_b16 v1, v0
47; UNALIGNED-NEXT:    s_endpgm
48  %val = load i16, i16 addrspace(3)* %in, align 1
49  store i16 %val, i16 addrspace(3)* %out, align 1
50  ret void
51}
52
; Copies one i16 between two addrspace(3) pointers with align 2.
; All four RUN configurations expect one ds_read_u16 and one ds_write_b16.
53define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
54; GCN-LABEL: ds2align2:
55; GCN:       ; %bb.0:
56; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
57; GCN-NEXT:    s_waitcnt lgkmcnt(0)
58; GCN-NEXT:    v_mov_b32_e32 v0, s0
59; GCN-NEXT:    ds_read_u16 v0, v0
60; GCN-NEXT:    v_mov_b32_e32 v1, s1
61; GCN-NEXT:    s_waitcnt lgkmcnt(0)
62; GCN-NEXT:    ds_write_b16 v1, v0
63; GCN-NEXT:    s_endpgm
64  %val = load i16, i16 addrspace(3)* %in, align 2
65  store i16 %val, i16 addrspace(3)* %out, align 2
66  ret void
67}
68
; Copies one i32 between two addrspace(3) pointers with align 1.
; With -mattr=-unaligned-access-mode the expected code uses four ds_read_u8 and
; four ds_write_b8 (offsets 0..3); with -mattr=+unaligned-access-mode a single
; ds_read_b32 / ds_write_b32 pair is expected.
69define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
70; ALIGNED-LABEL: ds4align1:
71; ALIGNED:       ; %bb.0:
72; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
73; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
74; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
75; ALIGNED-NEXT:    ds_read_u8 v1, v0
76; ALIGNED-NEXT:    ds_read_u8 v2, v0 offset:1
77; ALIGNED-NEXT:    ds_read_u8 v3, v0 offset:2
78; ALIGNED-NEXT:    ds_read_u8 v0, v0 offset:3
79; ALIGNED-NEXT:    v_mov_b32_e32 v4, s1
80; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
81; ALIGNED-NEXT:    ds_write_b8 v4, v1
82; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
83; ALIGNED-NEXT:    ds_write_b8 v4, v2 offset:1
84; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
85; ALIGNED-NEXT:    ds_write_b8 v4, v3 offset:2
86; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
87; ALIGNED-NEXT:    ds_write_b8 v4, v0 offset:3
88; ALIGNED-NEXT:    s_endpgm
89;
90; UNALIGNED-LABEL: ds4align1:
91; UNALIGNED:       ; %bb.0:
92; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
93; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
94; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
95; UNALIGNED-NEXT:    ds_read_b32 v0, v0
96; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
97; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
98; UNALIGNED-NEXT:    ds_write_b32 v1, v0
99; UNALIGNED-NEXT:    s_endpgm
100  %val = load i32, i32 addrspace(3)* %in, align 1
101  store i32 %val, i32 addrspace(3)* %out, align 1
102  ret void
103}
104
; Copies one i32 between two addrspace(3) pointers with align 2.
; With -mattr=-unaligned-access-mode the expected code uses two ds_read_u16 and
; two ds_write_b16 (offsets 0 and 2); with -mattr=+unaligned-access-mode a
; single ds_read_b32 / ds_write_b32 pair is expected.
105define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
106; ALIGNED-LABEL: ds4align2:
107; ALIGNED:       ; %bb.0:
108; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
110; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
111; ALIGNED-NEXT:    ds_read_u16 v1, v0
112; ALIGNED-NEXT:    ds_read_u16 v0, v0 offset:2
113; ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
114; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
115; ALIGNED-NEXT:    ds_write_b16 v2, v1
116; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
117; ALIGNED-NEXT:    ds_write_b16 v2, v0 offset:2
118; ALIGNED-NEXT:    s_endpgm
119;
120; UNALIGNED-LABEL: ds4align2:
121; UNALIGNED:       ; %bb.0:
122; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
123; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
124; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
125; UNALIGNED-NEXT:    ds_read_b32 v0, v0
126; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
127; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
128; UNALIGNED-NEXT:    ds_write_b32 v1, v0
129; UNALIGNED-NEXT:    s_endpgm
130  %val = load i32, i32 addrspace(3)* %in, align 2
131  store i32 %val, i32 addrspace(3)* %out, align 2
132  ret void
133}
134
; Copies one i32 between two addrspace(3) pointers with align 4.
; All four RUN configurations expect one ds_read_b32 and one ds_write_b32.
135define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
136; GCN-LABEL: ds4align4:
137; GCN:       ; %bb.0:
138; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
139; GCN-NEXT:    s_waitcnt lgkmcnt(0)
140; GCN-NEXT:    v_mov_b32_e32 v0, s0
141; GCN-NEXT:    ds_read_b32 v0, v0
142; GCN-NEXT:    v_mov_b32_e32 v1, s1
143; GCN-NEXT:    s_waitcnt lgkmcnt(0)
144; GCN-NEXT:    ds_write_b32 v1, v0
145; GCN-NEXT:    s_endpgm
146  %val = load i32, i32 addrspace(3)* %in, align 4
147  store i32 %val, i32 addrspace(3)* %out, align 4
148  ret void
149}
150
; Copies one <2 x i32> between two addrspace(3) pointers with align 1.
; With -mattr=-unaligned-access-mode the expected code uses eight ds_read_u8 and
; eight ds_write_b8. The -global-isel=0 and -global-isel=1 runs are checked
; under separate prefixes because their expected write order and s_waitcnt
; placement differ. With -mattr=+unaligned-access-mode a single
; ds_read2_b32 / ds_write2_b32 pair is expected.
151define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
152; ALIGNED-SDAG-LABEL: ds8align1:
153; ALIGNED-SDAG:       ; %bb.0:
154; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
155; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
156; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
157; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0
158; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:1
159; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:2
160; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:3
161; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:4
162; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:5
163; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
164; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
165; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
166; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
167; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v4 offset:2
168; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
169; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v5 offset:3
170; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v2
171; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v3 offset:1
172; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
173; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v8 offset:6
174; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
175; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v0 offset:7
176; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v6 offset:4
177; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v7 offset:5
178; ALIGNED-SDAG-NEXT:    s_endpgm
179;
180; ALIGNED-GISEL-LABEL: ds8align1:
181; ALIGNED-GISEL:       ; %bb.0:
182; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
183; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
184; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
185; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0
186; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:1
187; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:2
188; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:3
189; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:4
190; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:5
191; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:6
192; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:7
193; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, s1
194; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
195; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v2
196; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
197; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v3 offset:1
198; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
199; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v4 offset:2
200; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
201; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v5 offset:3
202; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
203; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v6 offset:4
204; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
205; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v7 offset:5
206; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
207; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v8 offset:6
208; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
209; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v0 offset:7
210; ALIGNED-GISEL-NEXT:    s_endpgm
211;
212; UNALIGNED-LABEL: ds8align1:
213; UNALIGNED:       ; %bb.0:
214; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
215; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
216; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
217; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
218; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
219; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
220; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
221; UNALIGNED-NEXT:    s_endpgm
222  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
223  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
224  ret void
225}
226
; Copies one <2 x i32> between two addrspace(3) pointers with align 2.
; With -mattr=-unaligned-access-mode the expected code uses four ds_read_u16
; and four ds_write_b16. The -global-isel=0 and -global-isel=1 runs are checked
; under separate prefixes because the expected order of the accesses differs.
; With -mattr=+unaligned-access-mode a single ds_read2_b32 / ds_write2_b32 pair
; is expected.
227define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
228; ALIGNED-SDAG-LABEL: ds8align2:
229; ALIGNED-SDAG:       ; %bb.0:
230; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
231; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
232; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
233; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:2
234; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
235; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:6
236; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:4
237; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
238; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
239; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1 offset:2
240; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
241; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v2
242; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
243; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v3 offset:6
244; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
245; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:4
246; ALIGNED-SDAG-NEXT:    s_endpgm
247;
248; ALIGNED-GISEL-LABEL: ds8align2:
249; ALIGNED-GISEL:       ; %bb.0:
250; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
251; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
252; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
253; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
254; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
255; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
256; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:6
257; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
258; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
259; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
260; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
261; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:2
262; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
263; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v3 offset:4
264; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
265; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:6
266; ALIGNED-GISEL-NEXT:    s_endpgm
267;
268; UNALIGNED-LABEL: ds8align2:
269; UNALIGNED:       ; %bb.0:
270; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
271; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
272; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
273; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
274; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
275; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
276; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
277; UNALIGNED-NEXT:    s_endpgm
278  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
279  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
280  ret void
281}
282
; Copies one <2 x i32> between two addrspace(3) pointers with align 4.
; All four RUN configurations expect one ds_read2_b32 and one ds_write2_b32
; (offset1:1), i.e. two dwords handled by a single instruction each way.
283define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
284; GCN-LABEL: ds8align4:
285; GCN:       ; %bb.0:
286; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
287; GCN-NEXT:    s_waitcnt lgkmcnt(0)
288; GCN-NEXT:    v_mov_b32_e32 v0, s0
289; GCN-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
290; GCN-NEXT:    v_mov_b32_e32 v2, s1
291; GCN-NEXT:    s_waitcnt lgkmcnt(0)
292; GCN-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
293; GCN-NEXT:    s_endpgm
294  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
295  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4
296  ret void
297}
298
; Copies one <2 x i32> between two addrspace(3) pointers with align 8.
; All four RUN configurations expect one ds_read_b64 and one ds_write_b64.
299define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
300; GCN-LABEL: ds8align8:
301; GCN:       ; %bb.0:
302; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
303; GCN-NEXT:    s_waitcnt lgkmcnt(0)
304; GCN-NEXT:    v_mov_b32_e32 v0, s0
305; GCN-NEXT:    ds_read_b64 v[0:1], v0
306; GCN-NEXT:    v_mov_b32_e32 v2, s1
307; GCN-NEXT:    s_waitcnt lgkmcnt(0)
308; GCN-NEXT:    ds_write_b64 v2, v[0:1]
309; GCN-NEXT:    s_endpgm
310  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8
311  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8
312  ret void
313}
314
; Copies one <3 x i32> between two addrspace(3) pointers with align 1.
; With -mattr=-unaligned-access-mode the expected code uses twelve ds_read_u8
; and twelve ds_write_b8. The -global-isel=1 expectation additionally contains
; shift/or instructions (v_lshl_or_b32, v_lshlrev_b32, v_or3_b32) that combine
; the first eight bytes into two registers, followed by v_lshrrev_b32 shifts
; that split them again before the byte stores; the -global-isel=0 expectation
; stores the loaded bytes directly. With -mattr=+unaligned-access-mode a single
; ds_read_b96 / ds_write_b96 pair is expected.
315define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
316; ALIGNED-SDAG-LABEL: ds12align1:
317; ALIGNED-SDAG:       ; %bb.0:
318; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
319; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
320; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
321; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
322; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
323; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
324; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
325; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
326; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
327; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
328; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
329; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
330; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
331; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
332; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
333; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
334; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
335; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
336; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
337; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
338; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
339; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
340; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
341; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
342; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
343; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
344; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
345; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
346; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
347; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
348; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
349; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
350; ALIGNED-SDAG-NEXT:    s_endpgm
351;
352; ALIGNED-GISEL-LABEL: ds12align1:
353; ALIGNED-GISEL:       ; %bb.0:
354; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
355; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
356; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
357; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v2
358; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v2 offset:1
359; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v2 offset:2
360; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v2 offset:3
361; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v2 offset:4
362; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v2 offset:5
363; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v2 offset:6
364; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v2 offset:7
365; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
366; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 8, v0
367; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
368; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
369; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
370; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
371; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v3
372; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
373; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
374; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
375; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
376; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
377; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v8
378; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v1, v3, v4
379; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v2 offset:8
380; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v2 offset:9
381; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v2 offset:10
382; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v2 offset:11
383; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 8, v0
384; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
385; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v9, s1
386; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 24, v0
387; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v0
388; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v6 offset:1
389; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v7 offset:2
390; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v8 offset:3
391; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
392; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
393; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
394; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v1 offset:4
395; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v0 offset:5
396; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v6 offset:6
397; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v7 offset:7
398; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(11)
399; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v3 offset:8
400; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(11)
401; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v4 offset:9
402; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(11)
403; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v5 offset:10
404; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(11)
405; ALIGNED-GISEL-NEXT:    ds_write_b8 v9, v2 offset:11
406; ALIGNED-GISEL-NEXT:    s_endpgm
407;
408; UNALIGNED-LABEL: ds12align1:
409; UNALIGNED:       ; %bb.0:
410; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
411; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
412; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
413; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
414; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
415; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
416; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
417; UNALIGNED-NEXT:    s_endpgm
418  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1
419  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1
420  ret void
421}
422
; Copies one <3 x i32> between two addrspace(3) pointers with align 2.
; With -mattr=-unaligned-access-mode the expected code uses six ds_read_u16 and
; six ds_write_b16. The -global-isel=1 expectation additionally combines the
; first four halfwords with v_lshl_or_b32 and splits them again with
; v_lshrrev_b32 before storing; the -global-isel=0 expectation stores the
; loaded halfwords directly. With -mattr=+unaligned-access-mode a single
; ds_read_b96 / ds_write_b96 pair is expected.
423define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
424; ALIGNED-SDAG-LABEL: ds12align2:
425; ALIGNED-SDAG:       ; %bb.0:
426; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
427; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
428; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
429; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:8
430; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
431; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
432; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
433; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
434; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v6, s1
435; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:10
436; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
437; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v1 offset:8
438; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
439; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v3 offset:2
440; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v2
441; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
442; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v4 offset:4
443; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
444; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v5 offset:6
445; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
446; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v0 offset:10
447; ALIGNED-SDAG-NEXT:    s_endpgm
448;
449; ALIGNED-GISEL-LABEL: ds12align2:
450; ALIGNED-GISEL:       ; %bb.0:
451; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
452; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
453; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
454; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
455; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
456; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
457; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
458; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:8
459; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:10
460; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
461; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
462; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
463; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
464; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
465; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
466; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v0
467; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v2 offset:2
468; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
469; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v1 offset:4
470; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v0 offset:6
471; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
472; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v6 offset:8
473; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
474; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v7 offset:10
475; ALIGNED-GISEL-NEXT:    s_endpgm
476;
477; UNALIGNED-LABEL: ds12align2:
478; UNALIGNED:       ; %bb.0:
479; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
480; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
481; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
482; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
483; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
484; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
485; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
486; UNALIGNED-NEXT:    s_endpgm
487  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2
488  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2
489  ret void
490}
491
; Copies one <3 x i32> between two addrspace(3) pointers with align 4.
; With -mattr=-unaligned-access-mode both instruction selectors expect the same
; split: ds_read2_b32 plus ds_read_b32 at offset 8, then ds_write2_b32 plus
; ds_write_b32 at offset 8. With -mattr=+unaligned-access-mode a single
; ds_read_b96 / ds_write_b96 pair is expected.
492define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
493; ALIGNED-LABEL: ds12align4:
494; ALIGNED:       ; %bb.0:
495; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
496; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
497; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
498; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
499; ALIGNED-NEXT:    ds_read_b32 v2, v2 offset:8
500; ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
501; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
502; ALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
503; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
504; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
505; ALIGNED-NEXT:    s_endpgm
506;
507; UNALIGNED-LABEL: ds12align4:
508; UNALIGNED:       ; %bb.0:
509; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
510; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
511; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
512; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
513; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
514; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
515; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
516; UNALIGNED-NEXT:    s_endpgm
517  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
518  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
519  ret void
520}
521
522; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
; Copies one <3 x i32> between two addrspace(3) pointers with align 8.
; With -mattr=-unaligned-access-mode the two instruction selectors are checked
; separately. The -global-isel=0 expectation reads with ds_read_b32 (offset 8)
; plus ds_read2_b32 but writes with ds_write_b32 (offset 8) plus ds_write_b64.
; The -global-isel=1 expectation uses ds_read_b64 plus ds_read_b32 and
; ds_write_b64 plus ds_write_b32. With -mattr=+unaligned-access-mode a single
; ds_read_b96 / ds_write_b96 pair is expected.
523define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
524; ALIGNED-SDAG-LABEL: ds12align8:
525; ALIGNED-SDAG:       ; %bb.0:
526; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
527; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
528; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
529; ALIGNED-SDAG-NEXT:    ds_read_b32 v2, v0 offset:8
530; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
531; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
532; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
533; ALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
534; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
535; ALIGNED-SDAG-NEXT:    ds_write_b64 v3, v[0:1]
536; ALIGNED-SDAG-NEXT:    s_endpgm
537;
538; ALIGNED-GISEL-LABEL: ds12align8:
539; ALIGNED-GISEL:       ; %bb.0:
540; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
541; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
542; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
543; ALIGNED-GISEL-NEXT:    ds_read_b64 v[0:1], v2
544; ALIGNED-GISEL-NEXT:    ds_read_b32 v2, v2 offset:8
545; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
546; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
547; ALIGNED-GISEL-NEXT:    ds_write_b64 v3, v[0:1]
548; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
549; ALIGNED-GISEL-NEXT:    ds_write_b32 v3, v2 offset:8
550; ALIGNED-GISEL-NEXT:    s_endpgm
551;
552; UNALIGNED-LABEL: ds12align8:
553; UNALIGNED:       ; %bb.0:
554; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
555; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
556; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
557; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
558; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
559; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
560; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
561; UNALIGNED-NEXT:    s_endpgm
562  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
563  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
564  ret void
565}
566
; Copies one <3 x i32> between two addrspace(3) pointers with align 16.
; All four RUN configurations expect one ds_read_b96 and one ds_write_b96.
567define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
568; GCN-LABEL: ds12align16:
569; GCN:       ; %bb.0:
570; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
571; GCN-NEXT:    s_waitcnt lgkmcnt(0)
572; GCN-NEXT:    v_mov_b32_e32 v0, s0
573; GCN-NEXT:    ds_read_b96 v[0:2], v0
574; GCN-NEXT:    v_mov_b32_e32 v3, s1
575; GCN-NEXT:    s_waitcnt lgkmcnt(0)
576; GCN-NEXT:    ds_write_b96 v3, v[0:2]
577; GCN-NEXT:    s_endpgm
578  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16
579  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16
580  ret void
581}
582
; Copies one <4 x i32> between two addrspace(3) pointers with align 1.
; With -mattr=-unaligned-access-mode the expected code uses sixteen ds_read_u8
; and sixteen ds_write_b8. The -global-isel=0 and -global-isel=1 runs are
; checked under separate prefixes because their expected write order and
; s_waitcnt placement differ. With -mattr=+unaligned-access-mode a single
; ds_read2_b64 / ds_write2_b64 pair is expected.
583define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
584; ALIGNED-SDAG-LABEL: ds16align1:
585; ALIGNED-SDAG:       ; %bb.0:
586; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
587; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
588; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
589; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
590; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
591; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
592; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
593; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
594; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
595; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
596; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
597; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
598; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
599; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
600; ALIGNED-SDAG-NEXT:    ds_read_u8 v12, v0 offset:11
601; ALIGNED-SDAG-NEXT:    ds_read_u8 v13, v0 offset:12
602; ALIGNED-SDAG-NEXT:    ds_read_u8 v14, v0 offset:13
603; ALIGNED-SDAG-NEXT:    ds_read_u8 v15, v0 offset:14
604; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:15
605; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v16, s1
606; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
607; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
608; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
609; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
610; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v3 offset:2
611; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v4 offset:3
612; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v1
613; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v2 offset:1
614; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
615; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
616; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
617; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
618; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v7 offset:6
619; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v8 offset:7
620; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v11 offset:10
621; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v12 offset:11
622; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
623; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v15 offset:14
624; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v0 offset:15
625; ALIGNED-SDAG-NEXT:    s_endpgm
626;
627; ALIGNED-GISEL-LABEL: ds16align1:
628; ALIGNED-GISEL:       ; %bb.0:
629; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
630; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
631; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
632; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
633; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
634; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
635; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
636; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
637; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
638; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
639; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
640; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v0 offset:8
641; ALIGNED-GISEL-NEXT:    ds_read_u8 v10, v0 offset:9
642; ALIGNED-GISEL-NEXT:    ds_read_u8 v11, v0 offset:10
643; ALIGNED-GISEL-NEXT:    ds_read_u8 v12, v0 offset:11
644; ALIGNED-GISEL-NEXT:    ds_read_u8 v13, v0 offset:12
645; ALIGNED-GISEL-NEXT:    ds_read_u8 v14, v0 offset:13
646; ALIGNED-GISEL-NEXT:    ds_read_u8 v15, v0 offset:14
647; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:15
648; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v16, s1
649; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
650; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v1
651; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v2 offset:1
652; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
653; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v3 offset:2
654; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v4 offset:3
655; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
656; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v5 offset:4
657; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v6 offset:5
658; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
659; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v7 offset:6
660; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v8 offset:7
661; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
662; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v9 offset:8
663; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v10 offset:9
664; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
665; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v11 offset:10
666; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v12 offset:11
667; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
668; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v13 offset:12
669; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v14 offset:13
670; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
671; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v15 offset:14
672; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v0 offset:15
673; ALIGNED-GISEL-NEXT:    s_endpgm
674;
675; UNALIGNED-LABEL: ds16align1:
676; UNALIGNED:       ; %bb.0:
677; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
678; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
679; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
680; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
681; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
682; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
683; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
684; UNALIGNED-NEXT:    s_endpgm
685  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
686  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
687  ret void
688}
689
; Copies a <4 x i32> from %in to %out (both addrspace(3) pointers) with the
; load and the store each marked align 2.
; Expected lowering, per the RUN lines at the top of the file:
;  * -unaligned-access-mode: the 16 bytes are split into eight ds_read_u16 and
;    eight ds_write_b16 operations. SelectionDAG and GlobalISel emit them in
;    different orders, hence the separate SDAG and GISEL check blocks.
;  * +unaligned-access-mode: one ds_read2_b64 and one ds_write2_b64, the same
;    for both instruction selectors.
690define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
691; ALIGNED-SDAG-LABEL: ds16align2:
692; ALIGNED-SDAG:       ; %bb.0:
693; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
694; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
695; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
696; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:12
697; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
698; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
699; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
700; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
701; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
702; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:10
703; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v8, s1
704; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
705; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
706; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v1 offset:12
707; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
708; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v3 offset:2
709; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v2
710; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
711; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v4 offset:4
712; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
713; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v6 offset:8
714; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v5 offset:6
715; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
716; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v7 offset:10
717; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
718; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v0 offset:14
719; ALIGNED-SDAG-NEXT:    s_endpgm
720;
721; ALIGNED-GISEL-LABEL: ds16align2:
722; ALIGNED-GISEL:       ; %bb.0:
723; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
724; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
725; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
726; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0
727; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:2
728; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:4
729; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:6
730; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:8
731; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:10
732; ALIGNED-GISEL-NEXT:    ds_read_u16 v8, v0 offset:12
733; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:14
734; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, s1
735; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
736; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v2
737; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
738; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v3 offset:2
739; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
740; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v4 offset:4
741; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
742; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v5 offset:6
743; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
744; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v6 offset:8
745; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
746; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v7 offset:10
747; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
748; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v8 offset:12
749; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
750; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v0 offset:14
751; ALIGNED-GISEL-NEXT:    s_endpgm
752;
753; UNALIGNED-LABEL: ds16align2:
754; UNALIGNED:       ; %bb.0:
755; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
756; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
757; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
758; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
759; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
760; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
761; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
762; UNALIGNED-NEXT:    s_endpgm
763  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
764  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
765  ret void
766}
767
; Copies a <4 x i32> from %in to %out (both addrspace(3) pointers) with the
; load and the store each marked align 4.
; Expected lowering, per the RUN lines at the top of the file:
;  * -unaligned-access-mode: two ds_read2_b32 and two ds_write2_b32, each
;    covering a pair of dwords. SelectionDAG handles the upper pair
;    (offset0:2 offset1:3) first while GlobalISel handles the lower pair
;    first, hence the separate SDAG and GISEL check blocks.
;  * +unaligned-access-mode: one ds_read2_b64 and one ds_write2_b64, the same
;    for both instruction selectors.
768define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
769; ALIGNED-SDAG-LABEL: ds16align4:
770; ALIGNED-SDAG:       ; %bb.0:
771; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
772; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
773; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
774; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
775; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
776; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
777; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
778; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
779; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
780; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v2, v3 offset1:1
781; ALIGNED-SDAG-NEXT:    s_endpgm
782;
783; ALIGNED-GISEL-LABEL: ds16align4:
784; ALIGNED-GISEL:       ; %bb.0:
785; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
786; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
787; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
788; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
789; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
790; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
791; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
792; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
793; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
794; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
795; ALIGNED-GISEL-NEXT:    s_endpgm
796;
797; UNALIGNED-LABEL: ds16align4:
798; UNALIGNED:       ; %bb.0:
799; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
800; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
801; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
802; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
803; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
804; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
805; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
806; UNALIGNED-NEXT:    s_endpgm
807  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
808  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
809  ret void
810}
811
; Copies a <4 x i32> from %in to %out (both addrspace(3) pointers) with the
; load and the store each marked align 8.
; All four RUN configurations share one check block here: the copy is
; expected to become one ds_read2_b64 and one ds_write2_b64 regardless of
; the unaligned-access-mode setting or the instruction selector.
812define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
813; GCN-LABEL: ds16align8:
814; GCN:       ; %bb.0:
815; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
816; GCN-NEXT:    s_waitcnt lgkmcnt(0)
817; GCN-NEXT:    v_mov_b32_e32 v0, s0
818; GCN-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
819; GCN-NEXT:    v_mov_b32_e32 v4, s1
820; GCN-NEXT:    s_waitcnt lgkmcnt(0)
821; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
822; GCN-NEXT:    s_endpgm
823  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
824  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
825  ret void
826}
827
; Copies a <4 x i32> from %in to %out (both addrspace(3) pointers) with the
; load and the store each marked align 16.
; All four RUN configurations share one check block here: the copy is
; expected to become a single ds_read_b128 and a single ds_write_b128
; regardless of the unaligned-access-mode setting or the instruction selector.
828define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
829; GCN-LABEL: ds16align16:
830; GCN:       ; %bb.0:
831; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
832; GCN-NEXT:    s_waitcnt lgkmcnt(0)
833; GCN-NEXT:    v_mov_b32_e32 v0, s0
834; GCN-NEXT:    ds_read_b128 v[0:3], v0
835; GCN-NEXT:    v_mov_b32_e32 v4, s1
836; GCN-NEXT:    s_waitcnt lgkmcnt(0)
837; GCN-NEXT:    ds_write_b128 v4, v[0:3]
838; GCN-NEXT:    s_endpgm
839  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
840  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16
841  ret void
842}
843