1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
6
7; Unaligned DS access is available from GFX9 onwards.
8; LDS alignment enforcement is controlled by a configuration register:
9; SH_MEM_CONFIG.alignment_mode
10
; Load a <4 x i32> through an LDS (addrspace(3)) pointer that is only 1-byte
; aligned and return it.
; The checks below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Expected selection recorded per target:
;   gfx900  - a single ds_read_b128
;   hawaii  - sixteen ds_read_u8 loads recombined with shifts and ors
;   gfx1010 - two ds_read2_b32 loads
;   gfx1100 - a single ds_load_b128
11define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
12; GFX9-LABEL: load_lds_v4i32_align1:
13; GFX9:       ; %bb.0:
14; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15; GFX9-NEXT:    ds_read_b128 v[0:3], v0
16; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
17; GFX9-NEXT:    s_setpc_b64 s[30:31]
18;
19; GFX7-LABEL: load_lds_v4i32_align1:
20; GFX7:       ; %bb.0:
21; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
23; GFX7-NEXT:    ds_read_u8 v2, v0
24; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
25; GFX7-NEXT:    s_mov_b32 m0, -1
26; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
27; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
28; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
29; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
30; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
31; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
32; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
33; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
34; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
35; GFX7-NEXT:    ds_read_u8 v9, v0 offset:8
36; GFX7-NEXT:    ds_read_u8 v10, v0 offset:9
37; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
38; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
39; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
40; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
41; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
42; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
43; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
44; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
45; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
46; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
47; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
48; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
49; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
50; GFX7-NEXT:    ds_read_u8 v3, v0 offset:11
51; GFX7-NEXT:    ds_read_u8 v5, v0 offset:12
52; GFX7-NEXT:    ds_read_u8 v6, v0 offset:13
53; GFX7-NEXT:    ds_read_u8 v7, v0 offset:14
54; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
55; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
56; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
57; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
58; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
59; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
60; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
61; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
62; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
63; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
64; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
65; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
66; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
67; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
69; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
70; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
71; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
72; GFX7-NEXT:    v_mov_b32_e32 v0, v4
73; GFX7-NEXT:    s_setpc_b64 s[30:31]
74;
75; GFX10-LABEL: load_lds_v4i32_align1:
76; GFX10:       ; %bb.0:
77; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
79; GFX10-NEXT:    v_mov_b32_e32 v2, v0
80; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
81; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
82; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX10-NEXT:    s_setpc_b64 s[30:31]
84;
85; GFX11-LABEL: load_lds_v4i32_align1:
86; GFX11:       ; %bb.0:
87; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
89; GFX11-NEXT:    ds_load_b128 v[0:3], v0
90; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
91; GFX11-NEXT:    s_setpc_b64 s[30:31]
92  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
93  ret <4 x i32> %load
94}
95
; Load a <3 x i32> through an LDS (addrspace(3)) pointer that is only 1-byte
; aligned and return it.
; The checks below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Expected selection recorded per target:
;   gfx900  - a single ds_read_b96
;   hawaii  - twelve ds_read_u8 loads recombined with shifts and ors
;   gfx1010 - one ds_read2_b32 plus one ds_read_b32 at offset 8
;   gfx1100 - a single ds_load_b96
96define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
97; GFX9-LABEL: load_lds_v3i32_align1:
98; GFX9:       ; %bb.0:
99; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
100; GFX9-NEXT:    ds_read_b96 v[0:2], v0
101; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
102; GFX9-NEXT:    s_setpc_b64 s[30:31]
103;
104; GFX7-LABEL: load_lds_v3i32_align1:
105; GFX7:       ; %bb.0:
106; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
108; GFX7-NEXT:    ds_read_u8 v2, v0
109; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
110; GFX7-NEXT:    s_mov_b32 m0, -1
111; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
112; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
113; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
114; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
115; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
116; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
117; GFX7-NEXT:    ds_read_u8 v5, v0 offset:5
118; GFX7-NEXT:    ds_read_u8 v6, v0 offset:6
119; GFX7-NEXT:    ds_read_u8 v7, v0 offset:7
120; GFX7-NEXT:    ds_read_u8 v8, v0 offset:8
121; GFX7-NEXT:    ds_read_u8 v9, v0 offset:9
122; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
123; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
124; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
125; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
126; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
127; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
128; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
129; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
130; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
131; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
132; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
133; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
134; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
135; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
136; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
137; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
138; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
139; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
140; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
141; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
142; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
143; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
144; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
145; GFX7-NEXT:    v_mov_b32_e32 v0, v3
146; GFX7-NEXT:    s_setpc_b64 s[30:31]
147;
148; GFX10-LABEL: load_lds_v3i32_align1:
149; GFX10:       ; %bb.0:
150; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
151; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
152; GFX10-NEXT:    v_mov_b32_e32 v2, v0
153; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
154; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
155; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
156; GFX10-NEXT:    s_setpc_b64 s[30:31]
157;
158; GFX11-LABEL: load_lds_v3i32_align1:
159; GFX11:       ; %bb.0:
160; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
162; GFX11-NEXT:    ds_load_b96 v[0:2], v0
163; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
164; GFX11-NEXT:    s_setpc_b64 s[30:31]
165  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
166  ret <3 x i32> %load
167}
168
; Store the <4 x i32> argument %x through an LDS (addrspace(3)) pointer that
; is only 1-byte aligned.
; The checks below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Expected selection recorded per target:
;   gfx900  - a single ds_write_b128
;   hawaii  - sixteen ds_write_b8 stores, bytes extracted with shifts and v_bfe_u32
;   gfx1010 - two ds_write2_b32 stores
;   gfx1100 - a single ds_store_b128
169define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
170; GFX9-LABEL: store_lds_v4i32_align1:
171; GFX9:       ; %bb.0:
172; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
173; GFX9-NEXT:    ds_write_b128 v0, v[1:4]
174; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX9-NEXT:    s_setpc_b64 s[30:31]
176;
177; GFX7-LABEL: store_lds_v4i32_align1:
178; GFX7:       ; %bb.0:
179; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
180; GFX7-NEXT:    s_mov_b32 m0, -1
181; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
182; GFX7-NEXT:    v_bfe_u32 v6, v1, 8, 8
183; GFX7-NEXT:    ds_write_b8 v0, v1
184; GFX7-NEXT:    ds_write_b8 v0, v6 offset:1
185; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
186; GFX7-NEXT:    ds_write_b8 v0, v5 offset:2
187; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
188; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
189; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
190; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
191; GFX7-NEXT:    ds_write_b8 v0, v5 offset:5
192; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
193; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
194; GFX7-NEXT:    ds_write_b8 v0, v2 offset:7
195; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
196; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
197; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
198; GFX7-NEXT:    ds_write_b8 v0, v2 offset:9
199; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
200; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
201; GFX7-NEXT:    ds_write_b8 v0, v2 offset:11
202; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
203; GFX7-NEXT:    v_bfe_u32 v2, v4, 8, 8
204; GFX7-NEXT:    ds_write_b8 v0, v4 offset:12
205; GFX7-NEXT:    ds_write_b8 v0, v2 offset:13
206; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v4
207; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
208; GFX7-NEXT:    ds_write_b8 v0, v2 offset:15
209; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX7-NEXT:    s_setpc_b64 s[30:31]
211;
212; GFX10-LABEL: store_lds_v4i32_align1:
213; GFX10:       ; %bb.0:
214; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
215; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
216; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
217; GFX10-NEXT:    ds_write2_b32 v0, v3, v4 offset0:2 offset1:3
218; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
219; GFX10-NEXT:    s_setpc_b64 s[30:31]
220;
221; GFX11-LABEL: store_lds_v4i32_align1:
222; GFX11:       ; %bb.0:
223; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
225; GFX11-NEXT:    ds_store_b128 v0, v[1:4]
226; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
227; GFX11-NEXT:    s_setpc_b64 s[30:31]
228  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
229  ret void
230}
231
; Store the <3 x i32> argument %x through an LDS (addrspace(3)) pointer that
; is only 1-byte aligned.
; The checks below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Expected selection recorded per target:
;   gfx900  - a single ds_write_b96
;   hawaii  - twelve ds_write_b8 stores, bytes extracted with shifts and v_bfe_u32
;   gfx1010 - one ds_write2_b32 plus one ds_write_b32 at offset 8
;   gfx1100 - a single ds_store_b96
232define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
233; GFX9-LABEL: store_lds_v3i32_align1:
234; GFX9:       ; %bb.0:
235; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236; GFX9-NEXT:    ds_write_b96 v0, v[1:3]
237; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
238; GFX9-NEXT:    s_setpc_b64 s[30:31]
239;
240; GFX7-LABEL: store_lds_v3i32_align1:
241; GFX7:       ; %bb.0:
242; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
243; GFX7-NEXT:    s_mov_b32 m0, -1
244; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
245; GFX7-NEXT:    v_bfe_u32 v5, v1, 8, 8
246; GFX7-NEXT:    ds_write_b8 v0, v1
247; GFX7-NEXT:    ds_write_b8 v0, v5 offset:1
248; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
249; GFX7-NEXT:    ds_write_b8 v0, v4 offset:2
250; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
251; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
252; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
253; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
254; GFX7-NEXT:    ds_write_b8 v0, v4 offset:5
255; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
256; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
257; GFX7-NEXT:    ds_write_b8 v0, v2 offset:7
258; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
259; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
260; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
261; GFX7-NEXT:    ds_write_b8 v0, v2 offset:9
262; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
263; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
264; GFX7-NEXT:    ds_write_b8 v0, v2 offset:11
265; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX7-NEXT:    s_setpc_b64 s[30:31]
267;
268; GFX10-LABEL: store_lds_v3i32_align1:
269; GFX10:       ; %bb.0:
270; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
271; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
272; GFX10-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
273; GFX10-NEXT:    ds_write_b32 v0, v3 offset:8
274; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX10-NEXT:    s_setpc_b64 s[30:31]
276;
277; GFX11-LABEL: store_lds_v3i32_align1:
278; GFX11:       ; %bb.0:
279; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
281; GFX11-NEXT:    ds_store_b96 v0, v[1:3]
282; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
283; GFX11-NEXT:    s_setpc_b64 s[30:31]
284  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
285  ret void
286}
287
; amdgpu_ps shader that loads a <8 x i32> with align 1 through the inreg
; addrspace(4) pointer %ptr and stores it through the inreg addrspace(1)
; pointer %out.
; The checks below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; On every target the recorded output is two 128-bit vector-memory loads
; (offsets 0 and 16) followed by two 128-bit stores; no scalar load appears:
;   gfx900 and gfx1010 - global_load_dwordx4 / global_store_dwordx4
;   hawaii             - buffer_load_dwordx4 / buffer_store_dwordx4
;   gfx1100            - global_load_b128 / global_store_b128
; The wait before the second store is vmcnt(1) on gfx900 and hawaii but
; vmcnt(0) on gfx1010 and gfx1100.
288define amdgpu_ps void @test_s_load_constant_v8i32_align1(<8 x i32> addrspace(4)* inreg %ptr, <8 x i32> addrspace(1)* inreg %out) {
289; GFX9-LABEL: test_s_load_constant_v8i32_align1:
290; GFX9:       ; %bb.0:
291; GFX9-NEXT:    v_mov_b32_e32 v8, 0
292; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
293; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
294; GFX9-NEXT:    s_waitcnt vmcnt(1)
295; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
296; GFX9-NEXT:    s_waitcnt vmcnt(1)
297; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
298; GFX9-NEXT:    s_endpgm
299;
300; GFX7-LABEL: test_s_load_constant_v8i32_align1:
301; GFX7:       ; %bb.0:
302; GFX7-NEXT:    s_mov_b32 s4, s2
303; GFX7-NEXT:    s_mov_b32 s5, s3
304; GFX7-NEXT:    s_mov_b32 s2, -1
305; GFX7-NEXT:    s_mov_b32 s3, 0xf000
306; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
307; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
308; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
309; GFX7-NEXT:    s_waitcnt vmcnt(1)
310; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
311; GFX7-NEXT:    s_waitcnt vmcnt(1)
312; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
313; GFX7-NEXT:    s_endpgm
314;
315; GFX10-LABEL: test_s_load_constant_v8i32_align1:
316; GFX10:       ; %bb.0:
317; GFX10-NEXT:    v_mov_b32_e32 v8, 0
318; GFX10-NEXT:    s_clause 0x1
319; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
320; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
321; GFX10-NEXT:    s_waitcnt vmcnt(1)
322; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
323; GFX10-NEXT:    s_waitcnt vmcnt(0)
324; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
325; GFX10-NEXT:    s_endpgm
326;
327; GFX11-LABEL: test_s_load_constant_v8i32_align1:
328; GFX11:       ; %bb.0:
329; GFX11-NEXT:    v_mov_b32_e32 v8, 0
330; GFX11-NEXT:    s_clause 0x1
331; GFX11-NEXT:    global_load_b128 v[0:3], v8, s[0:1]
332; GFX11-NEXT:    global_load_b128 v[4:7], v8, s[0:1] offset:16
333; GFX11-NEXT:    s_waitcnt vmcnt(1)
334; GFX11-NEXT:    global_store_b128 v8, v[0:3], s[2:3]
335; GFX11-NEXT:    s_waitcnt vmcnt(0)
336; GFX11-NEXT:    global_store_b128 v8, v[4:7], s[2:3] offset:16
337; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
338; GFX11-NEXT:    s_endpgm
339  %load = load <8 x i32>, <8 x i32> addrspace(4)* %ptr, align 1
340  store <8 x i32> %load, <8 x i32> addrspace(1)* %out
341  ret void
342}
343