1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
3; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
4; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
5; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
6
; Copies one i8 from %in to %out (both addrspace(3) pointers) using align 1.
; All four RUN configurations share the GCN checks, so a single ds_read_u8 and
; a single ds_write_b8 are expected regardless of the instruction selector or
; the unaligned-access-mode setting.
7define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
8; GCN-LABEL: ds1align1:
9; GCN:       ; %bb.0:
10; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
11; GCN-NEXT:    s_waitcnt lgkmcnt(0)
12; GCN-NEXT:    v_mov_b32_e32 v0, s0
13; GCN-NEXT:    ds_read_u8 v0, v0
14; GCN-NEXT:    v_mov_b32_e32 v1, s1
15; GCN-NEXT:    s_waitcnt lgkmcnt(0)
16; GCN-NEXT:    ds_write_b8 v1, v0
17; GCN-NEXT:    s_endpgm
18  %val = load i8, i8 addrspace(3)* %in, align 1
19  store i8 %val, i8 addrspace(3)* %out, align 1
20  ret void
21}
22
; Copies one i16 from %in to %out using align 1.
; With -mattr=-unaligned-access-mode the checks expect the access to be split
; into two byte-sized DS reads and two byte-sized DS writes; the GlobalISel
; output additionally re-packs the two bytes with v_lshl_or_b32 before
; splitting them again for the stores.
; With -mattr=+unaligned-access-mode both selectors are expected to emit a
; single ds_read_u16 / ds_write_b16 pair.
23define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
24; ALIGNED-SDAG-LABEL: ds2align1:
25; ALIGNED-SDAG:       ; %bb.0:
26; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
27; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
28; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
29; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
30; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:1
31; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s1
32; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
33; ALIGNED-SDAG-NEXT:    ds_write_b8 v2, v1
34; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
35; ALIGNED-SDAG-NEXT:    ds_write_b8 v2, v0 offset:1
36; ALIGNED-SDAG-NEXT:    s_endpgm
37;
38; ALIGNED-GISEL-LABEL: ds2align1:
39; ALIGNED-GISEL:       ; %bb.0:
40; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
42; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
43; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
44; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:1
45; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s1
46; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
47; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 8, v1
48; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
49; ALIGNED-GISEL-NEXT:    ds_write_b8 v2, v0
50; ALIGNED-GISEL-NEXT:    ds_write_b8 v2, v1 offset:1
51; ALIGNED-GISEL-NEXT:    s_endpgm
52;
53; UNALIGNED-LABEL: ds2align1:
54; UNALIGNED:       ; %bb.0:
55; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
56; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
57; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
58; UNALIGNED-NEXT:    ds_read_u16 v0, v0
59; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
60; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
61; UNALIGNED-NEXT:    ds_write_b16 v1, v0
62; UNALIGNED-NEXT:    s_endpgm
63  %val = load i16, i16 addrspace(3)* %in, align 1
64  store i16 %val, i16 addrspace(3)* %out, align 1
65  ret void
66}
67
; Copies one i16 from %in to %out using align 2 (naturally aligned).
; All four RUN configurations share the GCN checks, so a single ds_read_u16 /
; ds_write_b16 pair is expected in every mode.
68define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
69; GCN-LABEL: ds2align2:
70; GCN:       ; %bb.0:
71; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
72; GCN-NEXT:    s_waitcnt lgkmcnt(0)
73; GCN-NEXT:    v_mov_b32_e32 v0, s0
74; GCN-NEXT:    ds_read_u16 v0, v0
75; GCN-NEXT:    v_mov_b32_e32 v1, s1
76; GCN-NEXT:    s_waitcnt lgkmcnt(0)
77; GCN-NEXT:    ds_write_b16 v1, v0
78; GCN-NEXT:    s_endpgm
79  %val = load i16, i16 addrspace(3)* %in, align 2
80  store i16 %val, i16 addrspace(3)* %out, align 2
81  ret void
82}
83
; Copies one i32 from %in to %out using align 1.
; With -mattr=-unaligned-access-mode the checks expect four byte-sized DS reads
; and four byte-sized DS writes; the GlobalISel output first reassembles the
; dword (v_lshl_or_b32 / v_or3_b32) and then extracts the bytes again for the
; stores, using ds_write_b8_d16_hi for byte 2.
; With -mattr=+unaligned-access-mode a single ds_read_b32 / ds_write_b32 pair
; is expected from both selectors.
84define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
85; ALIGNED-SDAG-LABEL: ds4align1:
86; ALIGNED-SDAG:       ; %bb.0:
87; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
88; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
89; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
90; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
91; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
92; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
93; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:3
94; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
95; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
96; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v1
97; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
98; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v2 offset:1
99; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
100; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v3 offset:2
101; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
102; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v0 offset:3
103; ALIGNED-SDAG-NEXT:    s_endpgm
104;
105; ALIGNED-GISEL-LABEL: ds4align1:
106; ALIGNED-GISEL:       ; %bb.0:
107; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
108; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
109; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
110; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
111; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
112; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:3
113; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:2
114; ALIGNED-GISEL-NEXT:    s_mov_b32 s0, 8
115; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
116; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
117; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
118; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
119; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
120; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
121; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
122; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v2, v0, v1
123; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
124; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0
125; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:1
126; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
127; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v0 offset:2
128; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:3
129; ALIGNED-GISEL-NEXT:    s_endpgm
130;
131; UNALIGNED-LABEL: ds4align1:
132; UNALIGNED:       ; %bb.0:
133; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
134; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
135; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
136; UNALIGNED-NEXT:    ds_read_b32 v0, v0
137; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
138; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
139; UNALIGNED-NEXT:    ds_write_b32 v1, v0
140; UNALIGNED-NEXT:    s_endpgm
141  %val = load i32, i32 addrspace(3)* %in, align 1
142  store i32 %val, i32 addrspace(3)* %out, align 1
143  ret void
144}
145
; Copies one i32 from %in to %out using align 2.
; With -mattr=-unaligned-access-mode the checks expect two 16-bit DS reads and
; two 16-bit DS writes; the GlobalISel output merges the halves with
; v_lshl_or_b32 and stores the upper half via ds_write_b16_d16_hi.
; With -mattr=+unaligned-access-mode a single ds_read_b32 / ds_write_b32 pair
; is expected from both selectors.
146define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
147; ALIGNED-SDAG-LABEL: ds4align2:
148; ALIGNED-SDAG:       ; %bb.0:
149; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
150; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
151; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
152; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
153; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:2
154; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s1
155; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
156; ALIGNED-SDAG-NEXT:    ds_write_b16 v2, v1
157; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
158; ALIGNED-SDAG-NEXT:    ds_write_b16 v2, v0 offset:2
159; ALIGNED-SDAG-NEXT:    s_endpgm
160;
161; ALIGNED-GISEL-LABEL: ds4align2:
162; ALIGNED-GISEL:       ; %bb.0:
163; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
164; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
165; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
166; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
167; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:2
168; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s1
169; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
170; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
171; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v0
172; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v2, v0 offset:2
173; ALIGNED-GISEL-NEXT:    s_endpgm
174;
175; UNALIGNED-LABEL: ds4align2:
176; UNALIGNED:       ; %bb.0:
177; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
178; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
179; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
180; UNALIGNED-NEXT:    ds_read_b32 v0, v0
181; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
182; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
183; UNALIGNED-NEXT:    ds_write_b32 v1, v0
184; UNALIGNED-NEXT:    s_endpgm
185  %val = load i32, i32 addrspace(3)* %in, align 2
186  store i32 %val, i32 addrspace(3)* %out, align 2
187  ret void
188}
189
; Copies one i32 from %in to %out using align 4 (naturally aligned).
; All four RUN configurations share the GCN checks, so a single ds_read_b32 /
; ds_write_b32 pair is expected in every mode.
190define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
191; GCN-LABEL: ds4align4:
192; GCN:       ; %bb.0:
193; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
194; GCN-NEXT:    s_waitcnt lgkmcnt(0)
195; GCN-NEXT:    v_mov_b32_e32 v0, s0
196; GCN-NEXT:    ds_read_b32 v0, v0
197; GCN-NEXT:    v_mov_b32_e32 v1, s1
198; GCN-NEXT:    s_waitcnt lgkmcnt(0)
199; GCN-NEXT:    ds_write_b32 v1, v0
200; GCN-NEXT:    s_endpgm
201  %val = load i32, i32 addrspace(3)* %in, align 4
202  store i32 %val, i32 addrspace(3)* %out, align 4
203  ret void
204}
205
; Copies one <2 x i32> (8 bytes) from %in to %out using align 1.
; With -mattr=-unaligned-access-mode the checks expect eight byte-sized DS
; reads and eight byte-sized DS writes; the GlobalISel output reassembles each
; dword from its bytes before splitting it again for the stores.
; With -mattr=+unaligned-access-mode a single ds_read2_b32 / ds_write2_b32
; pair (offset1:1) is expected from both selectors.
206define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
207; ALIGNED-SDAG-LABEL: ds8align1:
208; ALIGNED-SDAG:       ; %bb.0:
209; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
210; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
211; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
212; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0
213; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:1
214; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:2
215; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:3
216; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:4
217; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:5
218; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
219; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
220; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
221; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
222; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v4 offset:2
223; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
224; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v5 offset:3
225; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v2
226; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v3 offset:1
227; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
228; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v8 offset:6
229; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
230; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v0 offset:7
231; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v6 offset:4
232; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v7 offset:5
233; ALIGNED-SDAG-NEXT:    s_endpgm
234;
235; ALIGNED-GISEL-LABEL: ds8align1:
236; ALIGNED-GISEL:       ; %bb.0:
237; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
238; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 8
239; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
240; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
241; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
242; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
243; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
244; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
245; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
246; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
247; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
248; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:7
249; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
250; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
251; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
252; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
253; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
254; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
255; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
256; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
257; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
258; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
259; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
260; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v2
261; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v1
262; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
263; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1
264; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v2 offset:1
265; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
266; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v3, v1 offset:2
267; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v2 offset:3
268; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
269; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v0 offset:4
270; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1 offset:5
271; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
272; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v3, v0 offset:6
273; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1 offset:7
274; ALIGNED-GISEL-NEXT:    s_endpgm
275;
276; UNALIGNED-LABEL: ds8align1:
277; UNALIGNED:       ; %bb.0:
278; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
279; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
280; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
281; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
282; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
283; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
284; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
285; UNALIGNED-NEXT:    s_endpgm
286  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
287  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
288  ret void
289}
290
; Copies one <2 x i32> (8 bytes) from %in to %out using align 2.
; With -mattr=-unaligned-access-mode the checks expect four 16-bit DS reads and
; four 16-bit DS writes; the GlobalISel output merges each pair of halves with
; v_lshl_or_b32 and stores the upper halves via ds_write_b16_d16_hi.
; With -mattr=+unaligned-access-mode a single ds_read2_b32 / ds_write2_b32
; pair (offset1:1) is expected from both selectors.
291define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
292; ALIGNED-SDAG-LABEL: ds8align2:
293; ALIGNED-SDAG:       ; %bb.0:
294; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
295; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
296; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
297; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:2
298; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
299; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:6
300; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:4
301; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
302; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
303; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1 offset:2
304; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
305; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v2
306; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
307; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v3 offset:6
308; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
309; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:4
310; ALIGNED-SDAG-NEXT:    s_endpgm
311;
312; ALIGNED-GISEL-LABEL: ds8align2:
313; ALIGNED-GISEL:       ; %bb.0:
314; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
315; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
316; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
317; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
318; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
319; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
320; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:6
321; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
322; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
323; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
324; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
325; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
326; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
327; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v1 offset:2
328; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:4
329; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v0 offset:6
330; ALIGNED-GISEL-NEXT:    s_endpgm
331;
332; UNALIGNED-LABEL: ds8align2:
333; UNALIGNED:       ; %bb.0:
334; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
335; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
336; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
337; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
338; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
339; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
340; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
341; UNALIGNED-NEXT:    s_endpgm
342  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
343  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
344  ret void
345}
346
; Copies one <2 x i32> (8 bytes) from %in to %out using align 4.
; All four RUN configurations share the GCN checks, so a single ds_read2_b32 /
; ds_write2_b32 pair (offset1:1) is expected in every mode.
347define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
348; GCN-LABEL: ds8align4:
349; GCN:       ; %bb.0:
350; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
351; GCN-NEXT:    s_waitcnt lgkmcnt(0)
352; GCN-NEXT:    v_mov_b32_e32 v0, s0
353; GCN-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
354; GCN-NEXT:    v_mov_b32_e32 v2, s1
355; GCN-NEXT:    s_waitcnt lgkmcnt(0)
356; GCN-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
357; GCN-NEXT:    s_endpgm
358  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
359  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4
360  ret void
361}
362
; Copies one <2 x i32> (8 bytes) from %in to %out using align 8.
; All four RUN configurations share the GCN checks, so a single ds_read_b64 /
; ds_write_b64 pair is expected in every mode.
363define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
364; GCN-LABEL: ds8align8:
365; GCN:       ; %bb.0:
366; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
367; GCN-NEXT:    s_waitcnt lgkmcnt(0)
368; GCN-NEXT:    v_mov_b32_e32 v0, s0
369; GCN-NEXT:    ds_read_b64 v[0:1], v0
370; GCN-NEXT:    v_mov_b32_e32 v2, s1
371; GCN-NEXT:    s_waitcnt lgkmcnt(0)
372; GCN-NEXT:    ds_write_b64 v2, v[0:1]
373; GCN-NEXT:    s_endpgm
374  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8
375  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8
376  ret void
377}
378
; Copies one <3 x i32> (12 bytes) from %in to %out using align 1.
; With -mattr=-unaligned-access-mode the checks expect twelve byte-sized DS
; reads and twelve byte-sized DS writes; the GlobalISel output reassembles each
; dword from its bytes before splitting it again for the stores.
; With -mattr=+unaligned-access-mode a single ds_read_b96 / ds_write_b96 pair
; is expected from both selectors.
379define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
380; ALIGNED-SDAG-LABEL: ds12align1:
381; ALIGNED-SDAG:       ; %bb.0:
382; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
383; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
384; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
385; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
386; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
387; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
388; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
389; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
390; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
391; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
392; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
393; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
394; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
395; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
396; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
397; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
398; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
399; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
400; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
401; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
402; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
403; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
404; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
405; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
406; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
407; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
408; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
409; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
410; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
411; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
412; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
413; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
414; ALIGNED-SDAG-NEXT:    s_endpgm
415;
416; ALIGNED-GISEL-LABEL: ds12align1:
417; ALIGNED-GISEL:       ; %bb.0:
418; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
419; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 8
420; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
421; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
422; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
423; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
424; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
425; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
426; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
427; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
428; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
429; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
430; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
431; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
432; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
433; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
434; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
435; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
436; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
437; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
438; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:8
439; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:9
440; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:10
441; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:11
442; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
443; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
444; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
445; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
446; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 8, v3
447; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
448; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
449; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
450; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
451; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v6, v7, v2
452; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v4, v3
453; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
454; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
455; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1
456; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v3 offset:1
457; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
458; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v1 offset:2
459; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v3 offset:3
460; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v2
461; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v2 offset:4
462; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:5
463; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
464; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v2 offset:6
465; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:7
466; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
467; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0 offset:8
468; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:9
469; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
470; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v0 offset:10
471; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:11
472; ALIGNED-GISEL-NEXT:    s_endpgm
473;
474; UNALIGNED-LABEL: ds12align1:
475; UNALIGNED:       ; %bb.0:
476; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
477; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
478; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
479; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
480; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
481; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
482; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
483; UNALIGNED-NEXT:    s_endpgm
484  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1
485  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1
486  ret void
487}
488
; Copies one <3 x i32> (12 bytes) from %in to %out using align 2.
; With -mattr=-unaligned-access-mode the checks expect six 16-bit DS reads and
; six 16-bit DS writes; the GlobalISel output merges each pair of halves with
; v_lshl_or_b32 and stores the upper halves via ds_write_b16_d16_hi.
; With -mattr=+unaligned-access-mode a single ds_read_b96 / ds_write_b96 pair
; is expected from both selectors.
489define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
490; ALIGNED-SDAG-LABEL: ds12align2:
491; ALIGNED-SDAG:       ; %bb.0:
492; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
493; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
494; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
495; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:8
496; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
497; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
498; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
499; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
500; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v6, s1
501; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:10
502; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
503; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v1 offset:8
504; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(4)
505; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v3 offset:2
506; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v2
507; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
508; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v4 offset:4
509; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
510; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v5 offset:6
511; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
512; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v0 offset:10
513; ALIGNED-SDAG-NEXT:    s_endpgm
514;
515; ALIGNED-GISEL-LABEL: ds12align2:
516; ALIGNED-GISEL:       ; %bb.0:
517; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
518; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
519; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
520; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
521; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
522; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
523; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
524; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
525; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:10
526; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v6, s1
527; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
528; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
529; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
530; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
531; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
532; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v5
533; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v1
534; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v6, v1 offset:2
535; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v2 offset:4
536; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v6, v2 offset:6
537; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v0 offset:8
538; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v6, v0 offset:10
539; ALIGNED-GISEL-NEXT:    s_endpgm
540;
541; UNALIGNED-LABEL: ds12align2:
542; UNALIGNED:       ; %bb.0:
543; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
544; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
545; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
546; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
547; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
548; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
549; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
550; UNALIGNED-NEXT:    s_endpgm
551  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2
552  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2
553  ret void
554}
555
; Copies one <3 x i32> (12 bytes) from %in to %out using align 4.
; With -mattr=-unaligned-access-mode both selectors share the ALIGNED checks,
; which expect the access to be split into a ds_read2_b32 plus a ds_read_b32
; and a ds_write2_b32 plus a ds_write_b32.
; With -mattr=+unaligned-access-mode a single ds_read_b96 / ds_write_b96 pair
; is expected.
556define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
557; ALIGNED-LABEL: ds12align4:
558; ALIGNED:       ; %bb.0:
559; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
560; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
561; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
562; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
563; ALIGNED-NEXT:    ds_read_b32 v2, v2 offset:8
564; ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
565; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
566; ALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
567; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
568; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
569; ALIGNED-NEXT:    s_endpgm
570;
571; UNALIGNED-LABEL: ds12align4:
572; UNALIGNED:       ; %bb.0:
573; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
574; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
575; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
576; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
577; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
578; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
579; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
580; UNALIGNED-NEXT:    s_endpgm
581  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
582  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
583  ret void
584}
585
586; TODO: Why does the ALIGNED-SDAG code use ds_write_b64 but not ds_read_b64?
; Copies one <3 x i32> (12 bytes) from %in to %out using align 8.
; With -mattr=-unaligned-access-mode the two selectors differ: the SelectionDAG
; checks expect ds_read_b32 + ds_read2_b32 for the load and ds_write_b32 +
; ds_write_b64 for the store, while the GlobalISel checks expect
; ds_read2_b32 + ds_read_b32 and ds_write2_b32 + ds_write_b32.
; With -mattr=+unaligned-access-mode a single ds_read_b96 / ds_write_b96 pair
; is expected from both selectors.
587define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
588; ALIGNED-SDAG-LABEL: ds12align8:
589; ALIGNED-SDAG:       ; %bb.0:
590; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
591; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
592; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
593; ALIGNED-SDAG-NEXT:    ds_read_b32 v2, v0 offset:8
594; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
595; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
596; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
597; ALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
598; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
599; ALIGNED-SDAG-NEXT:    ds_write_b64 v3, v[0:1]
600; ALIGNED-SDAG-NEXT:    s_endpgm
601;
602; ALIGNED-GISEL-LABEL: ds12align8:
603; ALIGNED-GISEL:       ; %bb.0:
604; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
605; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
606; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
607; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
608; ALIGNED-GISEL-NEXT:    ds_read_b32 v2, v2 offset:8
609; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
610; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
611; ALIGNED-GISEL-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
612; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
613; ALIGNED-GISEL-NEXT:    ds_write_b32 v3, v2 offset:8
614; ALIGNED-GISEL-NEXT:    s_endpgm
615;
616; UNALIGNED-LABEL: ds12align8:
617; UNALIGNED:       ; %bb.0:
618; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
619; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
620; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
621; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
622; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
623; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
624; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
625; UNALIGNED-NEXT:    s_endpgm
626  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
627  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
628  ret void
629}
630
; Copies one <3 x i32> (12 bytes) from %in to %out using align 16.
; All four RUN configurations share the GCN checks, so a single ds_read_b96 /
; ds_write_b96 pair is expected in every mode.
631define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
632; GCN-LABEL: ds12align16:
633; GCN:       ; %bb.0:
634; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
635; GCN-NEXT:    s_waitcnt lgkmcnt(0)
636; GCN-NEXT:    v_mov_b32_e32 v0, s0
637; GCN-NEXT:    ds_read_b96 v[0:2], v0
638; GCN-NEXT:    v_mov_b32_e32 v3, s1
639; GCN-NEXT:    s_waitcnt lgkmcnt(0)
640; GCN-NEXT:    ds_write_b96 v3, v[0:2]
641; GCN-NEXT:    s_endpgm
642  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16
643  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16
644  ret void
645}
646
; Test ds16align1 - copies one <4 x i32> (16 bytes) between two addrspace(3)
; pointers; both the load and the store are marked align 1.
; Expected lowering, per the RUN lines at the top of this file:
;   * -mattr=-unaligned-access-mode, SelectionDAG (ALIGNED-SDAG prefix) -
;     sixteen ds_read_u8 feeding sixteen ds_write_b8.
;   * -mattr=-unaligned-access-mode, GlobalISel (ALIGNED-GISEL prefix) -
;     sixteen ds_read_u8, the bytes are packed into four dwords with
;     shift/or instructions, and the dwords are then stored back one byte at
;     a time (ds_write_b8 and ds_write_b8_d16_hi).
;   * -mattr=+unaligned-access-mode, both selectors (UNALIGNED prefix) -
;     a single ds_read2_b64 and a single ds_write2_b64.
647define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
648; ALIGNED-SDAG-LABEL: ds16align1:
649; ALIGNED-SDAG:       ; %bb.0:
650; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
651; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
652; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
653; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
654; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
655; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
656; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
657; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
658; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
659; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
660; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
661; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
662; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
663; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
664; ALIGNED-SDAG-NEXT:    ds_read_u8 v12, v0 offset:11
665; ALIGNED-SDAG-NEXT:    ds_read_u8 v13, v0 offset:12
666; ALIGNED-SDAG-NEXT:    ds_read_u8 v14, v0 offset:13
667; ALIGNED-SDAG-NEXT:    ds_read_u8 v15, v0 offset:14
668; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:15
669; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v16, s1
670; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
671; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
672; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
673; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
674; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v3 offset:2
675; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v4 offset:3
676; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v1
677; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v2 offset:1
678; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
679; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
680; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
681; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
682; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v7 offset:6
683; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v8 offset:7
684; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v11 offset:10
685; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v12 offset:11
686; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
687; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v15 offset:14
688; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v0 offset:15
689; ALIGNED-SDAG-NEXT:    s_endpgm
690;
691; ALIGNED-GISEL-LABEL: ds16align1:
692; ALIGNED-GISEL:       ; %bb.0:
693; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
694; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 8
695; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
696; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
697; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
698; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
699; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
700; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
701; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
702; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
703; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
704; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
705; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
706; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
707; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
708; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
709; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
710; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
711; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
712; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
713; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
714; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
715; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
716; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v3, v4, v2
717; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:8
718; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:9
719; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:10
720; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:11
721; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:12
722; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:13
723; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v0 offset:14
724; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:15
725; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
726; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 8, v3
727; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
728; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
729; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
730; ALIGNED-GISEL-NEXT:    v_or3_b32 v3, v4, v5, v3
731; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
732; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
733; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
734; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
735; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
736; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v4
737; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v4, 8, v1
738; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
739; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1
740; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v4 offset:1
741; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v4, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
742; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v1 offset:2
743; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v4 offset:3
744; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v2
745; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v2 offset:4
746; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:5
747; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
748; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v2 offset:6
749; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:7
750; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v3
751; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v3 offset:8
752; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:9
753; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
754; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v3 offset:10
755; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:11
756; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
757; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v0 offset:12
758; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:13
759; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, 8
760; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
761; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v0 offset:14
762; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:15
763; ALIGNED-GISEL-NEXT:    s_endpgm
764;
765; UNALIGNED-LABEL: ds16align1:
766; UNALIGNED:       ; %bb.0:
767; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
768; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
769; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
770; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
771; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
772; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
773; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
774; UNALIGNED-NEXT:    s_endpgm
; IR under test - a 16-byte load followed by a 16-byte store, each align 1.
775  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
776  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
777  ret void
778}
779
; Test ds16align2 - copies one <4 x i32> (16 bytes) between two addrspace(3)
; pointers; both the load and the store are marked align 2.
; Expected lowering, per the RUN lines at the top of this file:
;   * -mattr=-unaligned-access-mode, SelectionDAG (ALIGNED-SDAG prefix) -
;     eight ds_read_u16 feeding eight ds_write_b16.
;   * -mattr=-unaligned-access-mode, GlobalISel (ALIGNED-GISEL prefix) -
;     eight ds_read_u16, halfword pairs are merged into four dwords with
;     v_lshl_or_b32, and each dword is stored as ds_write_b16 (low half) plus
;     ds_write_b16_d16_hi (high half).
;   * -mattr=+unaligned-access-mode, both selectors (UNALIGNED prefix) -
;     a single ds_read2_b64 and a single ds_write2_b64.
780define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
781; ALIGNED-SDAG-LABEL: ds16align2:
782; ALIGNED-SDAG:       ; %bb.0:
783; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
784; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
785; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
786; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:12
787; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
788; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
789; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
790; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
791; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
792; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:10
793; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v8, s1
794; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
795; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
796; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v1 offset:12
797; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
798; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v3 offset:2
799; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v2
800; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
801; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v4 offset:4
802; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
803; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v6 offset:8
804; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v5 offset:6
805; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
806; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v7 offset:10
807; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
808; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v0 offset:14
809; ALIGNED-SDAG-NEXT:    s_endpgm
810;
811; ALIGNED-GISEL-LABEL: ds16align2:
812; ALIGNED-GISEL:       ; %bb.0:
813; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
814; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
815; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
816; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
817; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
818; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
819; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
820; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
821; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:10
822; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:12
823; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:14
824; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
825; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
826; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
827; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
828; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
829; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
830; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v6, 16, v5
831; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
832; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v7
833; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
834; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v1 offset:2
835; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:4
836; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v2 offset:6
837; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v3 offset:8
838; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v3 offset:10
839; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:12
840; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v0 offset:14
841; ALIGNED-GISEL-NEXT:    s_endpgm
842;
843; UNALIGNED-LABEL: ds16align2:
844; UNALIGNED:       ; %bb.0:
845; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
846; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
847; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
848; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
849; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
850; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
851; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
852; UNALIGNED-NEXT:    s_endpgm
; IR under test - a 16-byte load followed by a 16-byte store, each align 2.
853  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
854  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
855  ret void
856}
857
; Test ds16align4 - copies one <4 x i32> (16 bytes) between two addrspace(3)
; pointers; both the load and the store are marked align 4.
; Expected lowering, per the RUN lines at the top of this file:
;   * -mattr=-unaligned-access-mode (ALIGNED-SDAG and ALIGNED-GISEL prefixes) -
;     two ds_read2_b32 and two ds_write2_b32. The two selectors differ only in
;     ordering: SelectionDAG handles dwords 2-3 first, GlobalISel dwords 0-1.
;   * -mattr=+unaligned-access-mode, both selectors (UNALIGNED prefix) -
;     a single ds_read2_b64 and a single ds_write2_b64.
858define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
859; ALIGNED-SDAG-LABEL: ds16align4:
860; ALIGNED-SDAG:       ; %bb.0:
861; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
862; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
863; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
864; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
865; ALIGNED-SDAG-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
866; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
867; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
868; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
869; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
870; ALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v2, v3 offset1:1
871; ALIGNED-SDAG-NEXT:    s_endpgm
872;
873; ALIGNED-GISEL-LABEL: ds16align4:
874; ALIGNED-GISEL:       ; %bb.0:
875; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
876; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
877; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
878; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
879; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
880; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
881; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
882; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
883; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
884; ALIGNED-GISEL-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
885; ALIGNED-GISEL-NEXT:    s_endpgm
886;
887; UNALIGNED-LABEL: ds16align4:
888; UNALIGNED:       ; %bb.0:
889; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
890; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
891; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
892; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
893; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
894; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
895; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
896; UNALIGNED-NEXT:    s_endpgm
; IR under test - a 16-byte load followed by a 16-byte store, each align 4.
897  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
898  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
899  ret void
900}
901
; Test ds16align8 - copies one <4 x i32> (16 bytes) between two addrspace(3)
; pointers; both the load and the store are marked align 8.
; The checks use the GCN prefix, which every RUN line at the top of this file
; enables, so the same output is expected for both instruction selectors and
; both settings of unaligned-access-mode - one ds_read2_b64 followed by one
; ds_write2_b64.
902define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
903; GCN-LABEL: ds16align8:
904; GCN:       ; %bb.0:
905; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
906; GCN-NEXT:    s_waitcnt lgkmcnt(0)
907; GCN-NEXT:    v_mov_b32_e32 v0, s0
908; GCN-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
909; GCN-NEXT:    v_mov_b32_e32 v4, s1
910; GCN-NEXT:    s_waitcnt lgkmcnt(0)
911; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
912; GCN-NEXT:    s_endpgm
; IR under test - a 16-byte load followed by a 16-byte store, each align 8.
913  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
914  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
915  ret void
916}
917
; Test ds16align16 - copies one <4 x i32> (16 bytes) between two addrspace(3)
; pointers; both the load and the store are marked align 16.
; The checks use the GCN prefix, which every RUN line at the top of this file
; enables, so the same output is expected for both instruction selectors and
; both settings of unaligned-access-mode - one ds_read_b128 followed by one
; ds_write_b128.
918define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
919; GCN-LABEL: ds16align16:
920; GCN:       ; %bb.0:
921; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
922; GCN-NEXT:    s_waitcnt lgkmcnt(0)
923; GCN-NEXT:    v_mov_b32_e32 v0, s0
924; GCN-NEXT:    ds_read_b128 v[0:3], v0
925; GCN-NEXT:    v_mov_b32_e32 v4, s1
926; GCN-NEXT:    s_waitcnt lgkmcnt(0)
927; GCN-NEXT:    ds_write_b128 v4, v[0:3]
928; GCN-NEXT:    s_endpgm
; IR under test - a 16-byte load followed by a 16-byte store, each align 16.
929  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
930  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16
931  ret void
932}
933