1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
3; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
4; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
5; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
6
; Copies one i8 from %in to %out (both addrspace(3)) with align 1.
; All four RUN configurations share the GCN checks, which expect a single
; ds_read_u8 followed by a single ds_write_b8.
7define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
8; GCN-LABEL: ds1align1:
9; GCN:       ; %bb.0:
10; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
11; GCN-NEXT:    s_waitcnt lgkmcnt(0)
12; GCN-NEXT:    v_mov_b32_e32 v0, s0
13; GCN-NEXT:    ds_read_u8 v0, v0
14; GCN-NEXT:    v_mov_b32_e32 v1, s1
15; GCN-NEXT:    s_waitcnt lgkmcnt(0)
16; GCN-NEXT:    ds_write_b8 v1, v0
17; GCN-NEXT:    s_endpgm
18  %val = load i8, i8 addrspace(3)* %in, align 1
19  store i8 %val, i8 addrspace(3)* %out, align 1
20  ret void
21}
22
; Copies one i16 from %in to %out (both addrspace(3)) with align 1.
; With -unaligned-access-mode (ALIGNED prefix) the checks expect the access to
; be split into two ds_read_u8 / two ds_write_b8; with +unaligned-access-mode
; (UNALIGNED prefix) a single ds_read_u16 / ds_write_b16 is expected.
23define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
24; ALIGNED-LABEL: ds2align1:
25; ALIGNED:       ; %bb.0:
26; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
27; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
28; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
29; ALIGNED-NEXT:    ds_read_u8 v1, v0
30; ALIGNED-NEXT:    ds_read_u8 v0, v0 offset:1
31; ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
32; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
33; ALIGNED-NEXT:    ds_write_b8 v2, v1
34; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
35; ALIGNED-NEXT:    ds_write_b8 v2, v0 offset:1
36; ALIGNED-NEXT:    s_endpgm
37;
38; UNALIGNED-LABEL: ds2align1:
39; UNALIGNED:       ; %bb.0:
40; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
42; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
43; UNALIGNED-NEXT:    ds_read_u16 v0, v0
44; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
45; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
46; UNALIGNED-NEXT:    ds_write_b16 v1, v0
47; UNALIGNED-NEXT:    s_endpgm
48  %val = load i16, i16 addrspace(3)* %in, align 1
49  store i16 %val, i16 addrspace(3)* %out, align 1
50  ret void
51}
52
; Copies one i16 from %in to %out (both addrspace(3)) with align 2.
; All four RUN configurations share the GCN checks, which expect a single
; ds_read_u16 / ds_write_b16.
53define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
54; GCN-LABEL: ds2align2:
55; GCN:       ; %bb.0:
56; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
57; GCN-NEXT:    s_waitcnt lgkmcnt(0)
58; GCN-NEXT:    v_mov_b32_e32 v0, s0
59; GCN-NEXT:    ds_read_u16 v0, v0
60; GCN-NEXT:    v_mov_b32_e32 v1, s1
61; GCN-NEXT:    s_waitcnt lgkmcnt(0)
62; GCN-NEXT:    ds_write_b16 v1, v0
63; GCN-NEXT:    s_endpgm
64  %val = load i16, i16 addrspace(3)* %in, align 2
65  store i16 %val, i16 addrspace(3)* %out, align 2
66  ret void
67}
68
; Copies one i32 from %in to %out (both addrspace(3)) with align 1.
; With -unaligned-access-mode (ALIGNED prefix) the checks expect four
; ds_read_u8 / four ds_write_b8; with +unaligned-access-mode (UNALIGNED
; prefix) a single ds_read_b32 / ds_write_b32 is expected.
69define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
70; ALIGNED-LABEL: ds4align1:
71; ALIGNED:       ; %bb.0:
72; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
73; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
74; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
75; ALIGNED-NEXT:    ds_read_u8 v1, v0
76; ALIGNED-NEXT:    ds_read_u8 v2, v0 offset:1
77; ALIGNED-NEXT:    ds_read_u8 v3, v0 offset:2
78; ALIGNED-NEXT:    ds_read_u8 v0, v0 offset:3
79; ALIGNED-NEXT:    v_mov_b32_e32 v4, s1
80; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
81; ALIGNED-NEXT:    ds_write_b8 v4, v1
82; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
83; ALIGNED-NEXT:    ds_write_b8 v4, v2 offset:1
84; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
85; ALIGNED-NEXT:    ds_write_b8 v4, v3 offset:2
86; ALIGNED-NEXT:    s_waitcnt lgkmcnt(3)
87; ALIGNED-NEXT:    ds_write_b8 v4, v0 offset:3
88; ALIGNED-NEXT:    s_endpgm
89;
90; UNALIGNED-LABEL: ds4align1:
91; UNALIGNED:       ; %bb.0:
92; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
93; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
94; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
95; UNALIGNED-NEXT:    ds_read_b32 v0, v0
96; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
97; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
98; UNALIGNED-NEXT:    ds_write_b32 v1, v0
99; UNALIGNED-NEXT:    s_endpgm
100  %val = load i32, i32 addrspace(3)* %in, align 1
101  store i32 %val, i32 addrspace(3)* %out, align 1
102  ret void
103}
104
; Copies one i32 from %in to %out (both addrspace(3)) with align 2.
; With -unaligned-access-mode (ALIGNED prefix) the checks expect two
; ds_read_u16 / two ds_write_b16; with +unaligned-access-mode (UNALIGNED
; prefix) a single ds_read_b32 / ds_write_b32 is expected.
105define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
106; ALIGNED-LABEL: ds4align2:
107; ALIGNED:       ; %bb.0:
108; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
109; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
110; ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
111; ALIGNED-NEXT:    ds_read_u16 v1, v0
112; ALIGNED-NEXT:    ds_read_u16 v0, v0 offset:2
113; ALIGNED-NEXT:    v_mov_b32_e32 v2, s1
114; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
115; ALIGNED-NEXT:    ds_write_b16 v2, v1
116; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
117; ALIGNED-NEXT:    ds_write_b16 v2, v0 offset:2
118; ALIGNED-NEXT:    s_endpgm
119;
120; UNALIGNED-LABEL: ds4align2:
121; UNALIGNED:       ; %bb.0:
122; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
123; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
124; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
125; UNALIGNED-NEXT:    ds_read_b32 v0, v0
126; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
127; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
128; UNALIGNED-NEXT:    ds_write_b32 v1, v0
129; UNALIGNED-NEXT:    s_endpgm
130  %val = load i32, i32 addrspace(3)* %in, align 2
131  store i32 %val, i32 addrspace(3)* %out, align 2
132  ret void
133}
134
; Copies one i32 from %in to %out (both addrspace(3)) with align 4.
; All four RUN configurations share the GCN checks, which expect a single
; ds_read_b32 / ds_write_b32.
135define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
136; GCN-LABEL: ds4align4:
137; GCN:       ; %bb.0:
138; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
139; GCN-NEXT:    s_waitcnt lgkmcnt(0)
140; GCN-NEXT:    v_mov_b32_e32 v0, s0
141; GCN-NEXT:    ds_read_b32 v0, v0
142; GCN-NEXT:    v_mov_b32_e32 v1, s1
143; GCN-NEXT:    s_waitcnt lgkmcnt(0)
144; GCN-NEXT:    ds_write_b32 v1, v0
145; GCN-NEXT:    s_endpgm
146  %val = load i32, i32 addrspace(3)* %in, align 4
147  store i32 %val, i32 addrspace(3)* %out, align 4
148  ret void
149}
150
; Copies one <2 x i32> (8 bytes) from %in to %out (both addrspace(3)) with
; align 1.
; With -unaligned-access-mode both selectors are expected to emit eight
; ds_read_u8 and eight ds_write_b8; the ALIGNED-SDAG and ALIGNED-GISEL checks
; differ only in the order of the byte stores and the s_waitcnt placement
; (GlobalISel stores in ascending offset order).
; With +unaligned-access-mode (UNALIGNED prefix) a single ds_read2_b32 /
; ds_write2_b32 pair is expected.
151define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
152; ALIGNED-SDAG-LABEL: ds8align1:
153; ALIGNED-SDAG:       ; %bb.0:
154; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
155; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
156; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
157; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0
158; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:1
159; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:2
160; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:3
161; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:4
162; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:5
163; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
164; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
165; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
166; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
167; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v4 offset:2
168; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
169; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v5 offset:3
170; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v2
171; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v3 offset:1
172; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
173; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v8 offset:6
174; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
175; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v0 offset:7
176; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v6 offset:4
177; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v7 offset:5
178; ALIGNED-SDAG-NEXT:    s_endpgm
179;
180; ALIGNED-GISEL-LABEL: ds8align1:
181; ALIGNED-GISEL:       ; %bb.0:
182; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
183; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
184; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
185; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0
186; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:1
187; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:2
188; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:3
189; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:4
190; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:5
191; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:6
192; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:7
193; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, s1
194; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
195; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v2
196; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
197; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v3 offset:1
198; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
199; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v4 offset:2
200; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
201; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v5 offset:3
202; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
203; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v6 offset:4
204; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
205; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v7 offset:5
206; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
207; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v8 offset:6
208; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
209; ALIGNED-GISEL-NEXT:    ds_write_b8 v1, v0 offset:7
210; ALIGNED-GISEL-NEXT:    s_endpgm
211;
212; UNALIGNED-LABEL: ds8align1:
213; UNALIGNED:       ; %bb.0:
214; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
215; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
216; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
217; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
218; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
219; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
220; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
221; UNALIGNED-NEXT:    s_endpgm
222  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
223  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
224  ret void
225}
226
; Copies one <2 x i32> (8 bytes) from %in to %out (both addrspace(3)) with
; align 2.
; With -unaligned-access-mode both selectors are expected to emit four
; ds_read_u16 and four ds_write_b16; the ALIGNED-SDAG and ALIGNED-GISEL checks
; differ only in the order of the stores and the s_waitcnt placement.
; With +unaligned-access-mode (UNALIGNED prefix) a single ds_read2_b32 /
; ds_write2_b32 pair is expected.
227define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
228; ALIGNED-SDAG-LABEL: ds8align2:
229; ALIGNED-SDAG:       ; %bb.0:
230; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
231; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
232; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
233; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
234; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0 offset:2
235; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:4
236; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:6
237; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
238; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
239; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v2 offset:2
240; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1
241; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(2)
242; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:6
243; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v3 offset:4
244; ALIGNED-SDAG-NEXT:    s_endpgm
245;
246; ALIGNED-GISEL-LABEL: ds8align2:
247; ALIGNED-GISEL:       ; %bb.0:
248; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
249; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
250; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
251; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
252; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
253; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
254; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:6
255; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
256; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
257; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
258; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
259; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:2
260; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
261; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v3 offset:4
262; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(3)
263; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:6
264; ALIGNED-GISEL-NEXT:    s_endpgm
265;
266; UNALIGNED-LABEL: ds8align2:
267; UNALIGNED:       ; %bb.0:
268; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
269; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
270; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
271; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
272; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
273; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
274; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
275; UNALIGNED-NEXT:    s_endpgm
276  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
277  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
278  ret void
279}
280
; Copies one <2 x i32> (8 bytes) from %in to %out (both addrspace(3)) with
; align 4.
; All four RUN configurations share the GCN checks, which expect a single
; ds_read2_b32 / ds_write2_b32 pair.
281define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
282; GCN-LABEL: ds8align4:
283; GCN:       ; %bb.0:
284; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
285; GCN-NEXT:    s_waitcnt lgkmcnt(0)
286; GCN-NEXT:    v_mov_b32_e32 v0, s0
287; GCN-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
288; GCN-NEXT:    v_mov_b32_e32 v2, s1
289; GCN-NEXT:    s_waitcnt lgkmcnt(0)
290; GCN-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
291; GCN-NEXT:    s_endpgm
292  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
293  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4
294  ret void
295}
296
; Copies one <2 x i32> (8 bytes) from %in to %out (both addrspace(3)) with
; align 8.
; All four RUN configurations share the GCN checks, which expect a single
; ds_read_b64 / ds_write_b64.
297define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
298; GCN-LABEL: ds8align8:
299; GCN:       ; %bb.0:
300; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
301; GCN-NEXT:    s_waitcnt lgkmcnt(0)
302; GCN-NEXT:    v_mov_b32_e32 v0, s0
303; GCN-NEXT:    ds_read_b64 v[0:1], v0
304; GCN-NEXT:    v_mov_b32_e32 v2, s1
305; GCN-NEXT:    s_waitcnt lgkmcnt(0)
306; GCN-NEXT:    ds_write_b64 v2, v[0:1]
307; GCN-NEXT:    s_endpgm
308  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8
309  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8
310  ret void
311}
312
; Copies one <3 x i32> (12 bytes) from %in to %out (both addrspace(3)) with
; align 1.
; With -unaligned-access-mode:
;   - the ALIGNED-SDAG checks expect twelve ds_read_u8 followed by twelve
;     ds_write_b8 of the loaded bytes;
;   - the ALIGNED-GISEL checks expect the twelve loaded bytes to be merged into
;     three 32-bit values (shift/and/or sequences) and then split back into
;     bytes with right shifts before the twelve ds_write_b8.
; With +unaligned-access-mode (UNALIGNED prefix) a single ds_read_b96 /
; ds_write_b96 is expected.
313define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
314; ALIGNED-SDAG-LABEL: ds12align1:
315; ALIGNED-SDAG:       ; %bb.0:
316; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
317; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
318; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
319; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
320; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
321; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
322; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
323; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
324; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
325; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
326; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
327; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
328; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
329; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
330; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
331; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
332; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
333; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
334; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
335; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
336; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
337; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
338; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
339; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
340; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
341; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
342; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
343; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
344; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
345; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
346; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
347; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
348; ALIGNED-SDAG-NEXT:    s_endpgm
349;
350; ALIGNED-GISEL-LABEL: ds12align1:
351; ALIGNED-GISEL:       ; %bb.0:
352; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
353; ALIGNED-GISEL-NEXT:    s_mov_b32 s3, 8
354; ALIGNED-GISEL-NEXT:    s_movk_i32 s2, 0xff
355; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
356; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
357; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s0
358; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v3
359; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v3 offset:1
360; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v3 offset:2
361; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v3 offset:3
362; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v3 offset:4
363; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v3 offset:5
364; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v3 offset:6
365; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v3 offset:7
366; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
367; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
368; ALIGNED-GISEL-NEXT:    v_and_or_b32 v0, v0, s2, v1
369; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(5)
370; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v1, s2, v4
371; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
372; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v4, s2, v5
373; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
374; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
375; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v4
376; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
377; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v1, s3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
378; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
379; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v4, v8, v2
380; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
381; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v5, v9, v2
382; ALIGNED-GISEL-NEXT:    v_and_or_b32 v1, v6, s2, v1
383; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
384; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 24, v5
385; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v1, v4, v5
386; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v3 offset:8
387; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v3 offset:9
388; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v3 offset:10
389; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v3 offset:11
390; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v7, 8
391; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
392; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_sdwa v5, v7, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
393; ALIGNED-GISEL-NEXT:    v_and_or_b32 v4, v4, v2, v5
394; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
395; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v5, v6, v2
396; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
397; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v2, v3, v2
398; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
399; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
400; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
401; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v6, s1
402; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v4, v5, v2
403; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
404; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
405; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0
406; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v3 offset:1
407; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v4 offset:2
408; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v5 offset:3
409; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
410; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
411; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
412; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:4
413; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0 offset:5
414; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v3 offset:6
415; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v4 offset:7
416; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 8, v2
417; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
418; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 24, v2
419; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:8
420; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0 offset:9
421; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:10
422; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v3 offset:11
423; ALIGNED-GISEL-NEXT:    s_endpgm
424;
425; UNALIGNED-LABEL: ds12align1:
426; UNALIGNED:       ; %bb.0:
427; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
428; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
429; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
430; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
431; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
432; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
433; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
434; UNALIGNED-NEXT:    s_endpgm
435  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1
436  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1
437  ret void
438}
439
; Copies one <3 x i32> (12 bytes) from %in to %out (both addrspace(3)) with
; align 2.
; With -unaligned-access-mode:
;   - the ALIGNED-SDAG checks expect six ds_read_u16 followed by six
;     ds_write_b16 of the loaded halves;
;   - the ALIGNED-GISEL checks expect the six loaded halves to be merged into
;     three 32-bit values (and/shift/or) and the high halves to be extracted
;     again with right shifts before the six ds_write_b16.
; With +unaligned-access-mode (UNALIGNED prefix) a single ds_read_b96 /
; ds_write_b96 is expected.
440define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
441; ALIGNED-SDAG-LABEL: ds12align2:
442; ALIGNED-SDAG:       ; %bb.0:
443; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
444; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
445; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
446; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
447; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
448; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
449; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
450; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
451; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:10
452; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
453; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
454; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v6 offset:8
455; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v3 offset:2
456; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v2
457; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v4 offset:4
458; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v5 offset:6
459; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
460; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v0 offset:10
461; ALIGNED-SDAG-NEXT:    s_endpgm
462;
463; ALIGNED-GISEL-LABEL: ds12align2:
464; ALIGNED-GISEL:       ; %bb.0:
465; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
466; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 0xffff
467; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
468; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
469; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
470; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
471; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
472; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
473; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
474; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:10
475; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
476; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v0, s2, v2
477; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
478; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
479; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v2, s2, v4
480; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
481; ALIGNED-GISEL-NEXT:    v_and_or_b32 v0, v1, s2, v0
482; ALIGNED-GISEL-NEXT:    v_and_or_b32 v1, v3, s2, v2
483; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
484; ALIGNED-GISEL-NEXT:    v_and_b32_e32 v2, s2, v6
485; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
486; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
487; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
488; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0
489; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v3 offset:2
490; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
491; ALIGNED-GISEL-NEXT:    v_and_or_b32 v2, v5, s2, v2
492; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1 offset:4
493; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:6
494; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v2
495; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:8
496; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:10
497; ALIGNED-GISEL-NEXT:    s_endpgm
498;
499; UNALIGNED-LABEL: ds12align2:
500; UNALIGNED:       ; %bb.0:
501; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
502; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
503; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
504; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
505; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
506; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
507; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
508; UNALIGNED-NEXT:    s_endpgm
509  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2
510  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2
511  ret void
512}
513
; Copies one <3 x i32> (12 bytes) from %in to %out (both addrspace(3)) with
; align 4.
; With -unaligned-access-mode (ALIGNED prefix, shared by both selectors) the
; checks expect ds_read2_b32 + ds_read_b32 and ds_write2_b32 + ds_write_b32;
; with +unaligned-access-mode (UNALIGNED prefix) a single ds_read_b96 /
; ds_write_b96 is expected.
514define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
515; ALIGNED-LABEL: ds12align4:
516; ALIGNED:       ; %bb.0:
517; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
518; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
519; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
520; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
521; ALIGNED-NEXT:    ds_read_b32 v2, v2 offset:8
522; ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
523; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
524; ALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
525; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
526; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
527; ALIGNED-NEXT:    s_endpgm
528;
529; UNALIGNED-LABEL: ds12align4:
530; UNALIGNED:       ; %bb.0:
531; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
532; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
533; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
534; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
535; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
536; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
537; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
538; UNALIGNED-NEXT:    s_endpgm
539  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
540  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
541  ret void
542}
543
544; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
; Copies one <3 x i32> (12 bytes) from %in to %out (both addrspace(3)) with
; align 8.
; With -unaligned-access-mode:
;   - the ALIGNED-SDAG checks expect ds_read2_b32 + ds_read_b32 for the load
;     and ds_write_b32 + ds_write_b64 for the store;
;   - the ALIGNED-GISEL checks expect ds_read_b64 + ds_read_b32 for the load
;     and ds_write_b64 + ds_write_b32 for the store.
; With +unaligned-access-mode (UNALIGNED prefix) a single ds_read_b96 /
; ds_write_b96 is expected.
545define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
546; ALIGNED-SDAG-LABEL: ds12align8:
547; ALIGNED-SDAG:       ; %bb.0:
548; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
549; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
550; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
551; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
552; ALIGNED-SDAG-NEXT:    ds_read_b32 v2, v2 offset:8
553; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
554; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
555; ALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
556; ALIGNED-SDAG-NEXT:    ds_write_b64 v3, v[0:1]
557; ALIGNED-SDAG-NEXT:    s_endpgm
558;
559; ALIGNED-GISEL-LABEL: ds12align8:
560; ALIGNED-GISEL:       ; %bb.0:
561; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
562; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
563; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
564; ALIGNED-GISEL-NEXT:    ds_read_b64 v[0:1], v2
565; ALIGNED-GISEL-NEXT:    ds_read_b32 v2, v2 offset:8
566; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
567; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
568; ALIGNED-GISEL-NEXT:    ds_write_b64 v3, v[0:1]
569; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
570; ALIGNED-GISEL-NEXT:    ds_write_b32 v3, v2 offset:8
571; ALIGNED-GISEL-NEXT:    s_endpgm
572;
573; UNALIGNED-LABEL: ds12align8:
574; UNALIGNED:       ; %bb.0:
575; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
576; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
577; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
578; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
579; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
580; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
581; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
582; UNALIGNED-NEXT:    s_endpgm
583  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
584  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
585  ret void
586}
587
; Copies one <3 x i32> (12 bytes) from %in to %out (both addrspace(3)) with
; align 16.
; All four RUN configurations share the GCN checks, which expect a single
; ds_read_b96 / ds_write_b96.
588define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
589; GCN-LABEL: ds12align16:
590; GCN:       ; %bb.0:
591; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
592; GCN-NEXT:    s_waitcnt lgkmcnt(0)
593; GCN-NEXT:    v_mov_b32_e32 v0, s0
594; GCN-NEXT:    ds_read_b96 v[0:2], v0
595; GCN-NEXT:    v_mov_b32_e32 v3, s1
596; GCN-NEXT:    s_waitcnt lgkmcnt(0)
597; GCN-NEXT:    ds_write_b96 v3, v[0:2]
598; GCN-NEXT:    s_endpgm
599  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16
600  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16
601  ret void
602}
603
; Copies one <4 x i32> (16 bytes) from %in to %out (both addrspace(3)) with
; align 1.
; With -unaligned-access-mode both selectors are expected to emit sixteen
; ds_read_u8 and sixteen ds_write_b8; the ALIGNED-SDAG and ALIGNED-GISEL checks
; differ only in the order of the byte stores and the s_waitcnt placement
; (GlobalISel stores in ascending offset order).
; With +unaligned-access-mode (UNALIGNED prefix) a single ds_read2_b64 /
; ds_write2_b64 pair is expected.
604define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
605; ALIGNED-SDAG-LABEL: ds16align1:
606; ALIGNED-SDAG:       ; %bb.0:
607; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
608; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
609; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
610; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
611; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
612; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
613; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
614; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
615; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
616; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
617; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
618; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
619; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
620; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
621; ALIGNED-SDAG-NEXT:    ds_read_u8 v12, v0 offset:11
622; ALIGNED-SDAG-NEXT:    ds_read_u8 v13, v0 offset:12
623; ALIGNED-SDAG-NEXT:    ds_read_u8 v14, v0 offset:13
624; ALIGNED-SDAG-NEXT:    ds_read_u8 v15, v0 offset:14
625; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:15
626; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v16, s1
627; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
628; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
629; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
630; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
631; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v3 offset:2
632; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v4 offset:3
633; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v1
634; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v2 offset:1
635; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
636; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
637; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
638; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
639; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v7 offset:6
640; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v8 offset:7
641; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v11 offset:10
642; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v12 offset:11
643; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
644; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v15 offset:14
645; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v0 offset:15
646; ALIGNED-SDAG-NEXT:    s_endpgm
647;
648; ALIGNED-GISEL-LABEL: ds16align1:
649; ALIGNED-GISEL:       ; %bb.0:
650; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
651; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
652; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
653; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
654; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
655; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
656; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
657; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
658; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
659; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
660; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
661; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v0 offset:8
662; ALIGNED-GISEL-NEXT:    ds_read_u8 v10, v0 offset:9
663; ALIGNED-GISEL-NEXT:    ds_read_u8 v11, v0 offset:10
664; ALIGNED-GISEL-NEXT:    ds_read_u8 v12, v0 offset:11
665; ALIGNED-GISEL-NEXT:    ds_read_u8 v13, v0 offset:12
666; ALIGNED-GISEL-NEXT:    ds_read_u8 v14, v0 offset:13
667; ALIGNED-GISEL-NEXT:    ds_read_u8 v15, v0 offset:14
668; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:15
669; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v16, s1
670; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
671; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v1
672; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v2 offset:1
673; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
674; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v3 offset:2
675; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v4 offset:3
676; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
677; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v5 offset:4
678; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v6 offset:5
679; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
680; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v7 offset:6
681; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v8 offset:7
682; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
683; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v9 offset:8
684; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v10 offset:9
685; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
686; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v11 offset:10
687; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v12 offset:11
688; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
689; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v13 offset:12
690; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v14 offset:13
691; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(14)
692; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v15 offset:14
693; ALIGNED-GISEL-NEXT:    ds_write_b8 v16, v0 offset:15
694; ALIGNED-GISEL-NEXT:    s_endpgm
695;
696; UNALIGNED-LABEL: ds16align1:
697; UNALIGNED:       ; %bb.0:
698; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
699; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
700; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
701; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
702; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
703; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
704; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
705; UNALIGNED-NEXT:    s_endpgm
706  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
707  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
708  ret void
709}
710
; ds16align2: load a <4 x i32> (16 bytes) from %in and store it to %out, both
; addrspace(3) pointers, with every access marked "align 2".
;
; Expected code per configuration (prefixes come from the llc invocations at
; the top of the file):
;   * ALIGNED-SDAG  (-global-isel=0, -unaligned-access-mode): eight
;     ds_read_u16 followed by eight ds_write_b16; the writes are emitted out
;     of offset order and only two s_waitcnt instructions are needed.
;   * ALIGNED-GISEL (-global-isel=1, -unaligned-access-mode): the same eight
;     16-bit reads, but the writes are in offset order and each one is
;     preceded by its own s_waitcnt lgkmcnt(7).
;   * UNALIGNED (+unaligned-access-mode, either selector): a single
;     ds_read2_b64 / ds_write2_b64 pair.
;
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
711define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
712; ALIGNED-SDAG-LABEL: ds16align2:
713; ALIGNED-SDAG:       ; %bb.0:
714; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
715; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
716; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
717; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
718; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
719; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
720; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
721; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
722; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:10
723; ALIGNED-SDAG-NEXT:    ds_read_u16 v8, v0 offset:12
724; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
725; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
726; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
727; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v8 offset:12
728; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v3 offset:2
729; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v2
730; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v4 offset:4
731; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v6 offset:8
732; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v5 offset:6
733; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v7 offset:10
734; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
735; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v0 offset:14
736; ALIGNED-SDAG-NEXT:    s_endpgm
737;
738; ALIGNED-GISEL-LABEL: ds16align2:
739; ALIGNED-GISEL:       ; %bb.0:
740; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
741; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
742; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
743; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0
744; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:2
745; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:4
746; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:6
747; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:8
748; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:10
749; ALIGNED-GISEL-NEXT:    ds_read_u16 v8, v0 offset:12
750; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:14
751; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, s1
752; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
753; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v2
754; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
755; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v3 offset:2
756; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
757; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v4 offset:4
758; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
759; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v5 offset:6
760; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
761; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v6 offset:8
762; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
763; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v7 offset:10
764; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
765; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v8 offset:12
766; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(7)
767; ALIGNED-GISEL-NEXT:    ds_write_b16 v1, v0 offset:14
768; ALIGNED-GISEL-NEXT:    s_endpgm
769;
770; UNALIGNED-LABEL: ds16align2:
771; UNALIGNED:       ; %bb.0:
772; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
773; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
774; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
775; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
776; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
777; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
778; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
779; UNALIGNED-NEXT:    s_endpgm
; Both the load and the store carry "align 2"; that alignment is the variable
; under test in this function.
780  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
781  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
782  ret void
783}
784
; ds16align4: load a <4 x i32> (16 bytes) from %in and store it to %out, both
; addrspace(3) pointers, with every access marked "align 4".
;
; Expected code per configuration (prefixes come from the llc invocations at
; the top of the file):
;   * ALIGNED-SDAG  (-global-isel=0, -unaligned-access-mode): two
;     ds_read2_b32, one s_waitcnt lgkmcnt(0), then two ds_write2_b32 with the
;     upper pair (offset0:2 offset1:3) written first.
;   * ALIGNED-GISEL (-global-isel=1, -unaligned-access-mode): the same two
;     ds_read2_b32, but each ds_write2_b32 is preceded by s_waitcnt
;     lgkmcnt(1) and the lower pair is written first.
;   * UNALIGNED (+unaligned-access-mode, either selector): a single
;     ds_read2_b64 / ds_write2_b64 pair.
;
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
785define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
786; ALIGNED-SDAG-LABEL: ds16align4:
787; ALIGNED-SDAG:       ; %bb.0:
788; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
789; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
790; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
791; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
792; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
793; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
794; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
795; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
796; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
797; ALIGNED-SDAG-NEXT:    s_endpgm
798;
799; ALIGNED-GISEL-LABEL: ds16align4:
800; ALIGNED-GISEL:       ; %bb.0:
801; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
802; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
803; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
804; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
805; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
806; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
807; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
808; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
809; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
810; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
811; ALIGNED-GISEL-NEXT:    s_endpgm
812;
813; UNALIGNED-LABEL: ds16align4:
814; UNALIGNED:       ; %bb.0:
815; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
816; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
817; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
818; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
819; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
820; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
821; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
822; UNALIGNED-NEXT:    s_endpgm
; Both the load and the store carry "align 4"; that alignment is the variable
; under test in this function.
823  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
824  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
825  ret void
826}
827
; ds16align8: load a <4 x i32> (16 bytes) from %in and store it to %out, both
; addrspace(3) pointers, with every access marked "align 8".
;
; All four llc configurations share the common GCN prefix here, i.e. the
; expected output is identical with and without unaligned-access-mode and for
; both instruction selectors: one ds_read2_b64 and one ds_write2_b64.
;
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
828define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
829; GCN-LABEL: ds16align8:
830; GCN:       ; %bb.0:
831; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
832; GCN-NEXT:    s_waitcnt lgkmcnt(0)
833; GCN-NEXT:    v_mov_b32_e32 v0, s0
834; GCN-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
835; GCN-NEXT:    v_mov_b32_e32 v4, s1
836; GCN-NEXT:    s_waitcnt lgkmcnt(0)
837; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
838; GCN-NEXT:    s_endpgm
; Both the load and the store carry "align 8"; that alignment is the variable
; under test in this function.
839  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
840  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
841  ret void
842}
843
; ds16align16: load a <4 x i32> (16 bytes) from %in and store it to %out, both
; addrspace(3) pointers, with every access marked "align 16".
;
; All four llc configurations share the common GCN prefix here, i.e. the
; expected output is identical with and without unaligned-access-mode and for
; both instruction selectors: a single ds_read_b128 and a single
; ds_write_b128.
;
; The assertion lines below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
844define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
845; GCN-LABEL: ds16align16:
846; GCN:       ; %bb.0:
847; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
848; GCN-NEXT:    s_waitcnt lgkmcnt(0)
849; GCN-NEXT:    v_mov_b32_e32 v0, s0
850; GCN-NEXT:    ds_read_b128 v[0:3], v0
851; GCN-NEXT:    v_mov_b32_e32 v4, s1
852; GCN-NEXT:    s_waitcnt lgkmcnt(0)
853; GCN-NEXT:    ds_write_b128 v4, v[0:3]
854; GCN-NEXT:    s_endpgm
; Both the load and the store carry "align 16"; that alignment is the variable
; under test in this function.
855  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
856  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16
857  ret void
858}
859