; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
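;
; Tests lowering of loads and stores to LDS (local, addrspace(3)) memory for
; access sizes of 1 to 16 bytes at alignments from 1 up to the natural
; alignment, comparing SelectionDAG and GlobalISel with unaligned-access-mode
; disabled (ALIGNED) and enabled (UNALIGNED).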

define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
; GCN-LABEL: ds1align1:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_u8 v0, v0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b8 v1, v0
; GCN-NEXT:    s_endpgm
  %val = load i8, i8 addrspace(3)* %in, align 1
  store i8 %val, i8 addrspace(3)* %out, align 1
  ret void
}

define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds2align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:1
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v2, v1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds2align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:1
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 8, v1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v2, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v2, v1 offset:1
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds2align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_u16 v0, v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b16 v1, v0
; UNALIGNED-NEXT:    s_endpgm
  %val = load i16, i16 addrspace(3)* %in, align 1
  store i16 %val, i16 addrspace(3)* %out, align 1
  ret void
}

define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
; GCN-LABEL: ds2align2:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_u16 v0, v0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b16 v1, v0
; GCN-NEXT:    s_endpgm
  %val = load i16, i16 addrspace(3)* %in, align 2
  store i16 %val, i16 addrspace(3)* %out, align 2
  ret void
}

define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds4align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:3
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v2 offset:1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v3 offset:2
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v0 offset:3
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds4align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:2
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v2, v0, v1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v2 offset:1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v0, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0 offset:3
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds4align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b32 v0, v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b32 v1, v0
; UNALIGNED-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(3)* %in, align 1
  store i32 %val, i32 addrspace(3)* %out, align 1
  ret void
}

define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds4align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:2
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v2, v1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v2, v0 offset:2
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds4align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:2
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v1 offset:2
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds4align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b32 v0, v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b32 v1, v0
; UNALIGNED-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(3)* %in, align 2
  store i32 %val, i32 addrspace(3)* %out, align 2
  ret void
}

define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
; GCN-LABEL: ds4align4:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b32 v0, v0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b32 v1, v0
; GCN-NEXT:    s_endpgm
  %val = load i32, i32 addrspace(3)* %in, align 4
  store i32 %val, i32 addrspace(3)* %out, align 4
  ret void
}

define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds8align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:1
; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:3
; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:5
; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v4 offset:2
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v5 offset:3
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v2
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v3 offset:1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v8 offset:6
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v0 offset:7
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v6 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v7 offset:5
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds8align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v3 offset:1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v2 offset:2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v2 offset:5
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v0, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:6
; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0 offset:7
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds8align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
  ret void
}

define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds8align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:4
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1 offset:2
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v2
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v3 offset:6
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds8align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:6
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:2
; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1 offset:6
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds8align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
  ret void
}

define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds8align4:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
; GCN-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4
  ret void
}

define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds8align8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b64 v[0:1], v0
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b64 v2, v[0:1]
; GCN-NEXT:    s_endpgm
  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8
  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8
  ret void
}

define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds12align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds12align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:9
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:10
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:11
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 8, v3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v4, v3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v6, v7, v2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v4, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v4 offset:1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v3
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v3 offset:2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v3, 8, v2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v2 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v3 offset:5
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:6
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v2 offset:7
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v2 offset:9
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v0, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:10
; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v0 offset:11
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1
  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1
  ret void
}

define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds12align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v6, s1
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:10
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v1 offset:8
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v2
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v4 offset:4
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v5 offset:6
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v0 offset:10
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds12align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:10
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v6, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v5
; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v1
; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v3 offset:2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v2 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v1 offset:6
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v1 offset:10
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2
  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2
  ret void
}

define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-LABEL: ds12align4:
; ALIGNED:       ; %bb.0:
; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-NEXT:    ds_read_b32 v2, v2 offset:8
; ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
; ALIGNED-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align4:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
  ret void
}

; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds12align8:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_b32 v2, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write_b64 v3, v[0:1]
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds12align8:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-GISEL-NEXT:    ds_read_b32 v2, v2 offset:8
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write_b32 v3, v2 offset:8
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds12align8:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
; UNALIGNED-NEXT:    s_endpgm
  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
  ret void
}

define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds12align16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b96 v[0:2], v0
; GCN-NEXT:    v_mov_b32_e32 v3, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b96 v3, v[0:2]
; GCN-NEXT:    s_endpgm
  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16
  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16
  ret void
}

define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds16align1:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
; ALIGNED-SDAG-NEXT:    ds_read_u8 v12, v0 offset:11
; ALIGNED-SDAG-NEXT:    ds_read_u8 v13, v0 offset:12
; ALIGNED-SDAG-NEXT:    ds_read_u8 v14, v0 offset:13
; ALIGNED-SDAG-NEXT:    ds_read_u8 v15, v0 offset:14
; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:15
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v16, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v4 offset:3
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v2 offset:1
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v7 offset:6
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v8 offset:7
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v11 offset:10
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v12 offset:11
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v15 offset:14
; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v0 offset:15
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds16align1:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v3, v4, v2
; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:9
; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:10
; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:11
; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:13
; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v0 offset:14
; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:15
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 8, v3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
; ALIGNED-GISEL-NEXT:    v_or3_b32 v3, v4, v5, v3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v4
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v6, s1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v5, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v5 offset:1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v4
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v4 offset:2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v4, 8, v2
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v4 offset:5
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:6
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:7
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v3
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v3 offset:8
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:9
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:10
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:11
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v0
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v2 offset:13
; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v0, 8, v1
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v1 offset:14
; ALIGNED-GISEL-NEXT:    ds_write_b8 v6, v0 offset:15
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds16align1:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds16align2:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:12
; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:10
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v8, s1
; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v1 offset:12
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v3 offset:2
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v2
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v4 offset:4
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v6 offset:8
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v5 offset:6
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v7 offset:10
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v0 offset:14
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds16align2:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:10
; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:14
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v6, 16, v5
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v1
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v4 offset:2
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v7
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v2 offset:4
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v1 offset:6
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v3 offset:8
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v1 offset:10
; ALIGNED-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v0 offset:12
; ALIGNED-GISEL-NEXT:    ds_write_b16 v5, v1 offset:14
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds16align2:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
  ret void
}

define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; ALIGNED-SDAG-LABEL: ds16align4:
; ALIGNED-SDAG:       ; %bb.0:
; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v2, v3 offset1:1
; ALIGNED-SDAG-NEXT:    s_endpgm
;
; ALIGNED-GISEL-LABEL: ds16align4:
; ALIGNED-GISEL:       ; %bb.0:
; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
; ALIGNED-GISEL-NEXT:    s_endpgm
;
; UNALIGNED-LABEL: ds16align4:
; UNALIGNED:       ; %bb.0:
; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; UNALIGNED-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
  ret void
}

define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds16align8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GCN-NEXT:    v_mov_b32_e32 v4, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GCN-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
  ret void
}

define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
; GCN-LABEL: ds16align16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    ds_read_b128 v[0:3], v0
; GCN-NEXT:    v_mov_b32_e32 v4, s1
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    ds_write_b128 v4, v[0:3]
; GCN-NEXT:    s_endpgm
  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16
  ret void
}