1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
3; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
4; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
5; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED
6
; Copies one i8 from %in to %out (both addrspace(3) pointers) with align 1.
; All four RUN configurations share the GCN prefix here: the checks expect a
; single ds_read_u8 / ds_write_b8 pair regardless of unaligned-access-mode.
7define amdgpu_kernel void @ds1align1(i8 addrspace(3)* %in, i8 addrspace(3)* %out) {
8; GCN-LABEL: ds1align1:
9; GCN:       ; %bb.0:
10; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
11; GCN-NEXT:    s_waitcnt lgkmcnt(0)
12; GCN-NEXT:    v_mov_b32_e32 v0, s0
13; GCN-NEXT:    ds_read_u8 v0, v0
14; GCN-NEXT:    v_mov_b32_e32 v1, s1
15; GCN-NEXT:    s_waitcnt lgkmcnt(0)
16; GCN-NEXT:    ds_write_b8 v1, v0
17; GCN-NEXT:    s_endpgm
18  %val = load i8, i8 addrspace(3)* %in, align 1
19  store i8 %val, i8 addrspace(3)* %out, align 1
20  ret void
21}
22
; Copies one i16 between addrspace(3) pointers with align 1.
; ALIGNED runs (-unaligned-access-mode) expect the access to be split into two
; byte-sized DS operations: SDAG forwards the two bytes directly, while GISEL
; reassembles the value with v_lshl_or_b32 and splits it again for the stores.
; UNALIGNED runs (+unaligned-access-mode) expect one ds_read_u16 / ds_write_b16.
23define amdgpu_kernel void @ds2align1(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
24; ALIGNED-SDAG-LABEL: ds2align1:
25; ALIGNED-SDAG:       ; %bb.0:
26; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
27; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
28; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
29; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
30; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:1
31; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s1
32; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
33; ALIGNED-SDAG-NEXT:    ds_write_b8 v2, v1
34; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
35; ALIGNED-SDAG-NEXT:    ds_write_b8 v2, v0 offset:1
36; ALIGNED-SDAG-NEXT:    s_endpgm
37;
38; ALIGNED-GISEL-LABEL: ds2align1:
39; ALIGNED-GISEL:       ; %bb.0:
40; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
41; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
42; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
43; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
44; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:1
45; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s1
46; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
47; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 8, v1
48; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
49; ALIGNED-GISEL-NEXT:    ds_write_b8 v2, v0
50; ALIGNED-GISEL-NEXT:    ds_write_b8 v2, v1 offset:1
51; ALIGNED-GISEL-NEXT:    s_endpgm
52;
53; UNALIGNED-LABEL: ds2align1:
54; UNALIGNED:       ; %bb.0:
55; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
56; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
57; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
58; UNALIGNED-NEXT:    ds_read_u16 v0, v0
59; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
60; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
61; UNALIGNED-NEXT:    ds_write_b16 v1, v0
62; UNALIGNED-NEXT:    s_endpgm
63  %val = load i16, i16 addrspace(3)* %in, align 1
64  store i16 %val, i16 addrspace(3)* %out, align 1
65  ret void
66}
67
; Copies one i16 between addrspace(3) pointers with align 2.
; All four RUN configurations share the GCN prefix: the checks expect a single
; ds_read_u16 / ds_write_b16 pair.
68define amdgpu_kernel void @ds2align2(i16 addrspace(3)* %in, i16 addrspace(3)* %out) {
69; GCN-LABEL: ds2align2:
70; GCN:       ; %bb.0:
71; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
72; GCN-NEXT:    s_waitcnt lgkmcnt(0)
73; GCN-NEXT:    v_mov_b32_e32 v0, s0
74; GCN-NEXT:    ds_read_u16 v0, v0
75; GCN-NEXT:    v_mov_b32_e32 v1, s1
76; GCN-NEXT:    s_waitcnt lgkmcnt(0)
77; GCN-NEXT:    ds_write_b16 v1, v0
78; GCN-NEXT:    s_endpgm
79  %val = load i16, i16 addrspace(3)* %in, align 2
80  store i16 %val, i16 addrspace(3)* %out, align 2
81  ret void
82}
83
; Copies one i32 between addrspace(3) pointers with align 1.
; ALIGNED runs (-unaligned-access-mode) expect four ds_read_u8 loads and four
; byte-sized stores; the GISEL variant rebuilds the i32 with shifts/or3 and
; stores byte 2 with ds_write_b8_d16_hi.
; UNALIGNED runs (+unaligned-access-mode) expect one ds_read_b32 / ds_write_b32.
84define amdgpu_kernel void @ds4align1(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
85; ALIGNED-SDAG-LABEL: ds4align1:
86; ALIGNED-SDAG:       ; %bb.0:
87; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
88; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
89; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
90; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
91; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
92; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
93; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:3
94; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
95; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
96; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v1
97; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
98; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v2 offset:1
99; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
100; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v3 offset:2
101; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
102; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v0 offset:3
103; ALIGNED-SDAG-NEXT:    s_endpgm
104;
105; ALIGNED-GISEL-LABEL: ds4align1:
106; ALIGNED-GISEL:       ; %bb.0:
107; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
108; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
109; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
110; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
111; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
112; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:3
113; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:2
114; ALIGNED-GISEL-NEXT:    s_mov_b32 s0, 8
115; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
116; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
117; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
118; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
119; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
120; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
121; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
122; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v2, v0, v1
123; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
124; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0
125; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:1
126; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
127; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v0 offset:2
128; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:3
129; ALIGNED-GISEL-NEXT:    s_endpgm
130;
131; UNALIGNED-LABEL: ds4align1:
132; UNALIGNED:       ; %bb.0:
133; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
134; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
135; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
136; UNALIGNED-NEXT:    ds_read_b32 v0, v0
137; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
138; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
139; UNALIGNED-NEXT:    ds_write_b32 v1, v0
140; UNALIGNED-NEXT:    s_endpgm
141  %val = load i32, i32 addrspace(3)* %in, align 1
142  store i32 %val, i32 addrspace(3)* %out, align 1
143  ret void
144}
145
; Copies one i32 between addrspace(3) pointers with align 2.
; ALIGNED runs (-unaligned-access-mode) expect two 16-bit loads and two 16-bit
; stores; the GISEL variant merges the halves with v_lshl_or_b32 and stores the
; upper half with ds_write_b16_d16_hi.
; UNALIGNED runs (+unaligned-access-mode) expect one ds_read_b32 / ds_write_b32.
146define amdgpu_kernel void @ds4align2(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
147; ALIGNED-SDAG-LABEL: ds4align2:
148; ALIGNED-SDAG:       ; %bb.0:
149; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
150; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
151; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
152; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
153; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:2
154; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s1
155; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
156; ALIGNED-SDAG-NEXT:    ds_write_b16 v2, v1
157; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
158; ALIGNED-SDAG-NEXT:    ds_write_b16 v2, v0 offset:2
159; ALIGNED-SDAG-NEXT:    s_endpgm
160;
161; ALIGNED-GISEL-LABEL: ds4align2:
162; ALIGNED-GISEL:       ; %bb.0:
163; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
164; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
165; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
166; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
167; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:2
168; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s1
169; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
170; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
171; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v0
172; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v2, v0 offset:2
173; ALIGNED-GISEL-NEXT:    s_endpgm
174;
175; UNALIGNED-LABEL: ds4align2:
176; UNALIGNED:       ; %bb.0:
177; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
178; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
179; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
180; UNALIGNED-NEXT:    ds_read_b32 v0, v0
181; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
182; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
183; UNALIGNED-NEXT:    ds_write_b32 v1, v0
184; UNALIGNED-NEXT:    s_endpgm
185  %val = load i32, i32 addrspace(3)* %in, align 2
186  store i32 %val, i32 addrspace(3)* %out, align 2
187  ret void
188}
189
; Copies one i32 between addrspace(3) pointers with align 4.
; All four RUN configurations share the GCN prefix: the checks expect a single
; ds_read_b32 / ds_write_b32 pair.
190define amdgpu_kernel void @ds4align4(i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
191; GCN-LABEL: ds4align4:
192; GCN:       ; %bb.0:
193; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
194; GCN-NEXT:    s_waitcnt lgkmcnt(0)
195; GCN-NEXT:    v_mov_b32_e32 v0, s0
196; GCN-NEXT:    ds_read_b32 v0, v0
197; GCN-NEXT:    v_mov_b32_e32 v1, s1
198; GCN-NEXT:    s_waitcnt lgkmcnt(0)
199; GCN-NEXT:    ds_write_b32 v1, v0
200; GCN-NEXT:    s_endpgm
201  %val = load i32, i32 addrspace(3)* %in, align 4
202  store i32 %val, i32 addrspace(3)* %out, align 4
203  ret void
204}
205
; Copies one <2 x i32> (8 bytes) between addrspace(3) pointers with align 1.
; ALIGNED runs (-unaligned-access-mode) expect eight ds_read_u8 loads and eight
; byte-sized stores (GISEL rebuilds each dword with shifts/or3 before
; re-splitting it).
; UNALIGNED runs (+unaligned-access-mode) expect ds_read2_b32 / ds_write2_b32.
206define amdgpu_kernel void @ds8align1(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
207; ALIGNED-SDAG-LABEL: ds8align1:
208; ALIGNED-SDAG:       ; %bb.0:
209; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
210; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
211; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
212; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0
213; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:1
214; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:2
215; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:3
216; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:4
217; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:5
218; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
219; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
220; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
221; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
222; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v4 offset:2
223; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
224; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v5 offset:3
225; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v2
226; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v3 offset:1
227; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
228; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v8 offset:6
229; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
230; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v0 offset:7
231; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v6 offset:4
232; ALIGNED-SDAG-NEXT:    ds_write_b8 v1, v7 offset:5
233; ALIGNED-SDAG-NEXT:    s_endpgm
234;
235; ALIGNED-GISEL-LABEL: ds8align1:
236; ALIGNED-GISEL:       ; %bb.0:
237; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
238; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 8
239; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
240; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
241; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
242; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
243; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
244; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
245; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
246; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
247; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
248; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:7
249; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
250; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
251; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
252; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
253; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
254; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
255; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
256; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
257; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
258; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
259; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
260; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v2
261; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v1
262; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
263; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1
264; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v2 offset:1
265; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
266; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v3, v1 offset:2
267; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v2 offset:3
268; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
269; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v0 offset:4
270; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1 offset:5
271; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
272; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v3, v0 offset:6
273; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1 offset:7
274; ALIGNED-GISEL-NEXT:    s_endpgm
275;
276; UNALIGNED-LABEL: ds8align1:
277; UNALIGNED:       ; %bb.0:
278; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
279; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
280; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
281; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
282; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
283; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
284; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
285; UNALIGNED-NEXT:    s_endpgm
286  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 1
287  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 1
288  ret void
289}
290
; Copies one <2 x i32> (8 bytes) between addrspace(3) pointers with align 2.
; ALIGNED runs (-unaligned-access-mode) expect four ds_read_u16 loads and four
; 16-bit stores (GISEL merges halves with v_lshl_or_b32 and uses
; ds_write_b16_d16_hi for the upper halves).
; UNALIGNED runs (+unaligned-access-mode) expect ds_read2_b32 / ds_write2_b32.
291define amdgpu_kernel void @ds8align2(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
292; ALIGNED-SDAG-LABEL: ds8align2:
293; ALIGNED-SDAG:       ; %bb.0:
294; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
295; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
296; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
297; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:2
298; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
299; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:6
300; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:4
301; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
302; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
303; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1 offset:2
304; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
305; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v2
306; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
307; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v3 offset:6
308; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
309; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:4
310; ALIGNED-SDAG-NEXT:    s_endpgm
311;
312; ALIGNED-GISEL-LABEL: ds8align2:
313; ALIGNED-GISEL:       ; %bb.0:
314; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
315; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
316; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
317; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
318; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
319; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
320; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:6
321; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
322; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
323; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
324; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
325; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
326; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
327; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v1 offset:2
328; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:4
329; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v0 offset:6
330; ALIGNED-GISEL-NEXT:    s_endpgm
331;
332; UNALIGNED-LABEL: ds8align2:
333; UNALIGNED:       ; %bb.0:
334; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
335; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
336; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
337; UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
338; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
339; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
340; UNALIGNED-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
341; UNALIGNED-NEXT:    s_endpgm
342  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 2
343  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 2
344  ret void
345}
346
; Copies one <2 x i32> (8 bytes) between addrspace(3) pointers with align 4.
; All four RUN configurations share the GCN prefix: the checks expect
; ds_read2_b32 / ds_write2_b32 with offset1:1 (two dword-sized accesses).
347define amdgpu_kernel void @ds8align4(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
348; GCN-LABEL: ds8align4:
349; GCN:       ; %bb.0:
350; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
351; GCN-NEXT:    s_waitcnt lgkmcnt(0)
352; GCN-NEXT:    v_mov_b32_e32 v0, s0
353; GCN-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
354; GCN-NEXT:    v_mov_b32_e32 v2, s1
355; GCN-NEXT:    s_waitcnt lgkmcnt(0)
356; GCN-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
357; GCN-NEXT:    s_endpgm
358  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
359  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 4
360  ret void
361}
362
; Copies one <2 x i32> (8 bytes) between addrspace(3) pointers with align 8.
; All four RUN configurations share the GCN prefix: the checks expect a single
; ds_read_b64 / ds_write_b64 pair.
363define amdgpu_kernel void @ds8align8(<2 x i32> addrspace(3)* %in, <2 x i32> addrspace(3)* %out) {
364; GCN-LABEL: ds8align8:
365; GCN:       ; %bb.0:
366; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
367; GCN-NEXT:    s_waitcnt lgkmcnt(0)
368; GCN-NEXT:    v_mov_b32_e32 v0, s0
369; GCN-NEXT:    ds_read_b64 v[0:1], v0
370; GCN-NEXT:    v_mov_b32_e32 v2, s1
371; GCN-NEXT:    s_waitcnt lgkmcnt(0)
372; GCN-NEXT:    ds_write_b64 v2, v[0:1]
373; GCN-NEXT:    s_endpgm
374  %val = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 8
375  store <2 x i32> %val, <2 x i32> addrspace(3)* %out, align 8
376  ret void
377}
378
; Copies one <3 x i32> (12 bytes) between addrspace(3) pointers with align 1.
; ALIGNED runs (-unaligned-access-mode) expect twelve ds_read_u8 loads and
; twelve byte-sized stores (GISEL rebuilds each dword with shifts/or3 before
; re-splitting it).
; UNALIGNED runs (+unaligned-access-mode) expect one ds_read_b96 / ds_write_b96.
379define amdgpu_kernel void @ds12align1(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
380; ALIGNED-SDAG-LABEL: ds12align1:
381; ALIGNED-SDAG:       ; %bb.0:
382; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
383; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
384; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
385; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
386; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
387; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
388; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
389; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
390; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
391; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
392; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
393; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
394; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
395; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
396; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
397; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
398; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
399; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
400; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
401; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
402; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
403; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
404; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
405; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
406; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
407; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
408; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
409; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
410; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
411; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
412; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
413; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
414; ALIGNED-SDAG-NEXT:    s_endpgm
415;
416; ALIGNED-GISEL-LABEL: ds12align1:
417; ALIGNED-GISEL:       ; %bb.0:
418; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
419; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 8
420; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
421; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
422; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
423; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
424; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
425; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
426; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
427; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
428; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
429; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
430; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
431; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
432; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
433; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
434; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
435; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
436; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
437; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
438; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:8
439; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:9
440; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:10
441; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:11
442; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
443; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
444; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
445; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
446; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 8, v3
447; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
448; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
449; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
450; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
451; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v6, v7, v2
452; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v4, v3
453; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
454; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
455; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1
456; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v3 offset:1
457; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
458; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v1 offset:2
459; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v3 offset:3
460; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v2
461; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v2 offset:4
462; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:5
463; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
464; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v2 offset:6
465; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:7
466; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
467; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0 offset:8
468; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:9
469; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
470; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v0 offset:10
471; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:11
472; ALIGNED-GISEL-NEXT:    s_endpgm
473;
474; UNALIGNED-LABEL: ds12align1:
475; UNALIGNED:       ; %bb.0:
476; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
477; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
478; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
479; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
480; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
481; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
482; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
483; UNALIGNED-NEXT:    s_endpgm
484  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 1
485  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 1
486  ret void
487}
488
; Copies one <3 x i32> (12 bytes) between addrspace(3) pointers with align 2.
; ALIGNED runs (-unaligned-access-mode) expect six ds_read_u16 loads and six
; 16-bit stores (GISEL merges halves with v_lshl_or_b32 and uses
; ds_write_b16_d16_hi for the upper halves).
; UNALIGNED runs (+unaligned-access-mode) expect one ds_read_b96 / ds_write_b96.
489define amdgpu_kernel void @ds12align2(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
490; ALIGNED-SDAG-LABEL: ds12align2:
491; ALIGNED-SDAG:       ; %bb.0:
492; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
493; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
494; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
495; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
496; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0 offset:2
497; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:4
498; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:10
499; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:8
500; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:6
501; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v6, s1
502; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
503; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v3 offset:4
504; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
505; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v4 offset:10
506; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
507; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v5 offset:8
508; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v1
509; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v2 offset:2
510; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
511; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v0 offset:6
512; ALIGNED-SDAG-NEXT:    s_endpgm
513;
514; ALIGNED-GISEL-LABEL: ds12align2:
515; ALIGNED-GISEL:       ; %bb.0:
516; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
517; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
518; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
519; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
520; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
521; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
522; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
523; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
524; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:10
525; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v6, s1
526; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
527; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
528; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
529; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
530; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
531; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v5
532; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v1
533; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v6, v1 offset:2
534; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v2 offset:4
535; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v6, v2 offset:6
536; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v0 offset:8
537; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v6, v0 offset:10
538; ALIGNED-GISEL-NEXT:    s_endpgm
539;
540; UNALIGNED-LABEL: ds12align2:
541; UNALIGNED:       ; %bb.0:
542; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
543; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
544; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
545; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
546; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
547; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
548; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
549; UNALIGNED-NEXT:    s_endpgm
550  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 2
551  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 2
552  ret void
553}
554
; Copies one <3 x i32> (12 bytes) between addrspace(3) pointers with align 4.
; ALIGNED runs (-unaligned-access-mode; SDAG and GISEL share the ALIGNED prefix)
; expect an 8 + 4 byte split: ds_read2_b32 + ds_read_b32 for the load and
; ds_write2_b32 + ds_write_b32 for the store.
; UNALIGNED runs (+unaligned-access-mode) expect one ds_read_b96 / ds_write_b96.
555define amdgpu_kernel void @ds12align4(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
556; ALIGNED-LABEL: ds12align4:
557; ALIGNED:       ; %bb.0:
558; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
559; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
560; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
561; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
562; ALIGNED-NEXT:    ds_read_b32 v2, v2 offset:8
563; ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
564; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
565; ALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
566; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
567; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
568; ALIGNED-NEXT:    s_endpgm
569;
570; UNALIGNED-LABEL: ds12align4:
571; UNALIGNED:       ; %bb.0:
572; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
573; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
574; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
575; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
576; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
577; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
578; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
579; UNALIGNED-NEXT:    s_endpgm
580  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 4
581  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 4
582  ret void
583}
584
585; NOTE(review): the original TODO here asked "Why does the ALIGNED-SDAG code use
; ds_write_b64 but not ds_read_b64?". The ALIGNED-SDAG checks below contain both
; ds_read_b64 and ds_write_b64, so that question does not match the current
; expectations -- confirm by regenerating the checks with
; utils/update_llc_test_checks.py, then drop this note or restore the TODO.
;
; Copies one <3 x i32> (12 bytes) between addrspace(3) pointers with align 8.
; ALIGNED runs (-unaligned-access-mode) expect an 8 + 4 byte split: SDAG uses
; ds_read_b64/ds_write_b64 for the first 8 bytes, GISEL uses
; ds_read2_b32/ds_write2_b32; both use ds_read_b32/ds_write_b32 at offset:8.
; UNALIGNED runs (+unaligned-access-mode) expect one ds_read_b96 / ds_write_b96.
586define amdgpu_kernel void @ds12align8(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
587; ALIGNED-SDAG-LABEL: ds12align8:
588; ALIGNED-SDAG:       ; %bb.0:
589; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
590; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
591; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
592; ALIGNED-SDAG-NEXT:    ds_read_b64 v[0:1], v2
593; ALIGNED-SDAG-NEXT:    ds_read_b32 v2, v2 offset:8
594; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
595; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
596; ALIGNED-SDAG-NEXT:    ds_write_b64 v3, v[0:1]
597; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
598; ALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
599; ALIGNED-SDAG-NEXT:    s_endpgm
600;
601; ALIGNED-GISEL-LABEL: ds12align8:
602; ALIGNED-GISEL:       ; %bb.0:
603; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
604; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
605; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
606; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
607; ALIGNED-GISEL-NEXT:    ds_read_b32 v2, v2 offset:8
608; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
609; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
610; ALIGNED-GISEL-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
611; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
612; ALIGNED-GISEL-NEXT:    ds_write_b32 v3, v2 offset:8
613; ALIGNED-GISEL-NEXT:    s_endpgm
614;
615; UNALIGNED-LABEL: ds12align8:
616; UNALIGNED:       ; %bb.0:
617; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
618; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
619; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
620; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
621; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
622; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
623; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
624; UNALIGNED-NEXT:    s_endpgm
625  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 8
626  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 8
627  ret void
628}
629
; Copies one <3 x i32> (12 bytes) between addrspace(3) pointers with align 16.
; All four RUN configurations share the GCN prefix: the checks expect a single
; ds_read_b96 / ds_write_b96 pair.
630define amdgpu_kernel void @ds12align16(<3 x i32> addrspace(3)* %in, <3 x i32> addrspace(3)* %out) {
631; GCN-LABEL: ds12align16:
632; GCN:       ; %bb.0:
633; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
634; GCN-NEXT:    s_waitcnt lgkmcnt(0)
635; GCN-NEXT:    v_mov_b32_e32 v0, s0
636; GCN-NEXT:    ds_read_b96 v[0:2], v0
637; GCN-NEXT:    v_mov_b32_e32 v3, s1
638; GCN-NEXT:    s_waitcnt lgkmcnt(0)
639; GCN-NEXT:    ds_write_b96 v3, v[0:2]
640; GCN-NEXT:    s_endpgm
641  %val = load <3 x i32>, <3 x i32> addrspace(3)* %in, align 16
642  store <3 x i32> %val, <3 x i32> addrspace(3)* %out, align 16
643  ret void
644}
645
; ds16align1 copies one <4 x i32> value (16 bytes) from the addrspace(3) pointer
; %in to the addrspace(3) pointer %out, with both the load and the store marked
; align 1.
; What the checks below expect, per configuration from the RUN lines
;   -unaligned-access-mode, global-isel=0 -- sixteen ds_read_u8 followed by
;     sixteen ds_write_b8, one per byte
;   -unaligned-access-mode, global-isel=1 -- sixteen ds_read_u8, the bytes are
;     combined into dwords with shift/or instructions, then written back out
;     one byte at a time (ds_write_b8 and ds_write_b8_d16_hi)
;   +unaligned-access-mode, either selector -- a single ds_read2_b64 and a
;     single ds_write2_b64
646define amdgpu_kernel void @ds16align1(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
647; ALIGNED-SDAG-LABEL: ds16align1:
648; ALIGNED-SDAG:       ; %bb.0:
649; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
650; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
651; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
652; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
653; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
654; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
655; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
656; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
657; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
658; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
659; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
660; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
661; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
662; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
663; ALIGNED-SDAG-NEXT:    ds_read_u8 v12, v0 offset:11
664; ALIGNED-SDAG-NEXT:    ds_read_u8 v13, v0 offset:12
665; ALIGNED-SDAG-NEXT:    ds_read_u8 v14, v0 offset:13
666; ALIGNED-SDAG-NEXT:    ds_read_u8 v15, v0 offset:14
667; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:15
668; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v16, s1
669; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
670; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v3 offset:2
671; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
672; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v4 offset:3
673; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v1
674; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v2 offset:1
675; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
676; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v7 offset:6
677; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
678; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v8 offset:7
679; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
680; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
681; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
682; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v11 offset:10
683; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
684; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v12 offset:11
685; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
686; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
687; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
688; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v15 offset:14
689; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(13)
690; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v0 offset:15
691; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
692; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
693; ALIGNED-SDAG-NEXT:    s_endpgm
694;
695; ALIGNED-GISEL-LABEL: ds16align1:
696; ALIGNED-GISEL:       ; %bb.0:
697; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
698; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 8
699; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
700; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
701; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
702; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
703; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
704; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
705; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
706; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
707; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
708; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
709; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
710; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
711; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
712; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
713; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
714; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
715; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
716; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
717; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
718; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
719; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
720; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v3, v4, v2
721; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:8
722; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:9
723; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:10
724; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:11
725; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:12
726; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:13
727; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v0 offset:14
728; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:15
729; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
730; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 8, v3
731; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
732; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
733; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
734; ALIGNED-GISEL-NEXT:    v_or3_b32 v3, v4, v5, v3
735; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
736; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
737; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
738; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
739; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
740; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v4
741; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v4, 8, v1
742; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
743; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1
744; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v4 offset:1
745; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v4, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
746; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v1 offset:2
747; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v4 offset:3
748; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v2
749; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v2 offset:4
750; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:5
751; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
752; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v2 offset:6
753; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:7
754; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v3
755; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v3 offset:8
756; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:9
757; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
758; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v3 offset:10
759; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:11
760; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
761; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v0 offset:12
762; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:13
763; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, 8
764; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
765; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v0 offset:14
766; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:15
767; ALIGNED-GISEL-NEXT:    s_endpgm
768;
769; UNALIGNED-LABEL: ds16align1:
770; UNALIGNED:       ; %bb.0:
771; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
772; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
773; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
774; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
775; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
776; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
777; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
778; UNALIGNED-NEXT:    s_endpgm
779  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 1
780  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 1
781  ret void
782}
783
; ds16align2 copies one <4 x i32> value (16 bytes) from the addrspace(3) pointer
; %in to the addrspace(3) pointer %out, with both the load and the store marked
; align 2.
; What the checks below expect, per configuration from the RUN lines
;   -unaligned-access-mode, global-isel=0 -- eight ds_read_u16 followed by
;     eight ds_write_b16, one per 16-bit half
;   -unaligned-access-mode, global-isel=1 -- eight ds_read_u16, pairs are
;     packed into dwords with v_lshl_or_b32, then stored as halves using
;     ds_write_b16 and ds_write_b16_d16_hi
;   +unaligned-access-mode, either selector -- a single ds_read2_b64 and a
;     single ds_write2_b64
784define amdgpu_kernel void @ds16align2(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
785; ALIGNED-SDAG-LABEL: ds16align2:
786; ALIGNED-SDAG:       ; %bb.0:
787; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
788; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
789; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
790; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
791; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
792; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
793; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
794; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
795; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:10
796; ALIGNED-SDAG-NEXT:    ds_read_u16 v8, v0 offset:12
797; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
798; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v1, s1
799; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
800; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v3 offset:2
801; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v2
802; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
803; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v5 offset:6
804; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v4 offset:4
805; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
806; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v7 offset:10
807; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v6 offset:8
808; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
809; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v0 offset:14
810; ALIGNED-SDAG-NEXT:    ds_write_b16 v1, v8 offset:12
811; ALIGNED-SDAG-NEXT:    s_endpgm
812;
813; ALIGNED-GISEL-LABEL: ds16align2:
814; ALIGNED-GISEL:       ; %bb.0:
815; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
816; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
817; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
818; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
819; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
820; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
821; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
822; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
823; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:10
824; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:12
825; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:14
826; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
827; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
828; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
829; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
830; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
831; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
832; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v6, 16, v5
833; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
834; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v7
835; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
836; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v1 offset:2
837; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:4
838; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v2 offset:6
839; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v3 offset:8
840; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v3 offset:10
841; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:12
842; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v0 offset:14
843; ALIGNED-GISEL-NEXT:    s_endpgm
844;
845; UNALIGNED-LABEL: ds16align2:
846; UNALIGNED:       ; %bb.0:
847; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
848; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
849; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
850; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
851; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
852; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
853; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
854; UNALIGNED-NEXT:    s_endpgm
855  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 2
856  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 2
857  ret void
858}
859
; ds16align4 copies one <4 x i32> value (16 bytes) from the addrspace(3) pointer
; %in to the addrspace(3) pointer %out, with both the load and the store marked
; align 4.
; What the checks below expect, per configuration from the RUN lines
;   -unaligned-access-mode, both selectors share one set of checks -- two
;     ds_read2_b32 followed by two ds_write2_b32, each covering two dwords
;   +unaligned-access-mode, either selector -- a single ds_read2_b64 and a
;     single ds_write2_b64
860define amdgpu_kernel void @ds16align4(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
861; ALIGNED-LABEL: ds16align4:
862; ALIGNED:       ; %bb.0:
863; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
864; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
865; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
866; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
867; ALIGNED-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
868; ALIGNED-NEXT:    v_mov_b32_e32 v4, s1
869; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
870; ALIGNED-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
871; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
872; ALIGNED-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
873; ALIGNED-NEXT:    s_endpgm
874;
875; UNALIGNED-LABEL: ds16align4:
876; UNALIGNED:       ; %bb.0:
877; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
878; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
879; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
880; UNALIGNED-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
881; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
882; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
883; UNALIGNED-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
884; UNALIGNED-NEXT:    s_endpgm
885  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 4
886  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 4
887  ret void
888}
889
; ds16align8 copies one <4 x i32> value (16 bytes) from the addrspace(3) pointer
; %in to the addrspace(3) pointer %out, with both the load and the store marked
; align 8.
; All four RUN configurations share the same checks here -- a single
; ds_read2_b64 and a single ds_write2_b64, regardless of the
; unaligned-access-mode setting or the instruction selector.
890define amdgpu_kernel void @ds16align8(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
891; GCN-LABEL: ds16align8:
892; GCN:       ; %bb.0:
893; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
894; GCN-NEXT:    s_waitcnt lgkmcnt(0)
895; GCN-NEXT:    v_mov_b32_e32 v0, s0
896; GCN-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
897; GCN-NEXT:    v_mov_b32_e32 v4, s1
898; GCN-NEXT:    s_waitcnt lgkmcnt(0)
899; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
900; GCN-NEXT:    s_endpgm
901  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 8
902  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 8
903  ret void
904}
905
; ds16align16 copies one <4 x i32> value (16 bytes) from the addrspace(3)
; pointer %in to the addrspace(3) pointer %out, with both the load and the
; store marked align 16.
; All four RUN configurations share the same checks here -- the whole value
; moves with one ds_read_b128 and one ds_write_b128.
906define amdgpu_kernel void @ds16align16(<4 x i32> addrspace(3)* %in, <4 x i32> addrspace(3)* %out) {
907; GCN-LABEL: ds16align16:
908; GCN:       ; %bb.0:
909; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
910; GCN-NEXT:    s_waitcnt lgkmcnt(0)
911; GCN-NEXT:    v_mov_b32_e32 v0, s0
912; GCN-NEXT:    ds_read_b128 v[0:3], v0
913; GCN-NEXT:    v_mov_b32_e32 v4, s1
914; GCN-NEXT:    s_waitcnt lgkmcnt(0)
915; GCN-NEXT:    ds_write_b128 v4, v[0:3]
916; GCN-NEXT:    s_endpgm
917  %val = load <4 x i32>, <4 x i32> addrspace(3)* %in, align 16
918  store <4 x i32> %val, <4 x i32> addrspace(3)* %out, align 16
919  ret void
920}
921