; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+unaligned-access-mode -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s

; Unaligned DS access is available from GFX9 onwards.
; LDS alignment enforcement is controlled by a configuration register:
; SH_MEM_CONFIG.alignment_mode

; Loads a <4 x i32> through an addrspace(3) pointer with `align 1`.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Expected code per -mcpu:
;   - gfx900 expects one ds_read2_b64 covering all 16 bytes.
;   - hawaii expects sixteen ds_read_u8, recombined with v_lshlrev_b32 / v_or_b32.
;   - gfx1010 expects sixteen ds_read_u8, recombined with v_lshl_or_b32 / v_or3_b32.
define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v2, v0
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    v_or_b32_e32 v4, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(5)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v6
; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v8
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v5
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:11
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:12
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:13
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:14
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:15
; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v10
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
; GFX7-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v9
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v8
; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
; GFX7-NEXT:    v_or_b32_e32 v3, v3, v5
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
; GFX7-NEXT:    v_or_b32_e32 v3, v0, v3
; GFX7-NEXT:    v_mov_b32_e32 v0, v4
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u8 v1, v0
; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
; GFX10-NEXT:    ds_read_u8 v15, v0 offset:15
; GFX10-NEXT:    ds_read_u8 v0, v0 offset:14
; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(13)
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v12
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v10, v14, 8, v13
; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 24, v15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e32 v12, 16, v0
; GFX10-NEXT:    v_or3_b32 v0, v2, v3, v1
; GFX10-NEXT:    v_or3_b32 v1, v5, v6, v4
; GFX10-NEXT:    v_or3_b32 v2, v8, v9, v7
; GFX10-NEXT:    v_or3_b32 v3, v11, v12, v10
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
  ret <4 x i32> %load
}

; Loads a <3 x i32> through an addrspace(3) pointer with `align 1`.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Expected code per -mcpu:
;   - gfx900 expects one ds_read_b96 covering all 12 bytes.
;   - hawaii expects twelve ds_read_u8, recombined with v_lshlrev_b32 / v_or_b32.
;   - gfx1010 expects twelve ds_read_u8, recombined with v_lshl_or_b32 / v_or3_b32.
define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
; GFX9-LABEL: load_lds_v3i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_read_b96 v[0:2], v0
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: load_lds_v3i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    ds_read_u8 v1, v0 offset:1
; GFX7-NEXT:    ds_read_u8 v2, v0
; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
; GFX7-NEXT:    ds_read_u8 v2, v0 offset:3
; GFX7-NEXT:    ds_read_u8 v4, v0 offset:4
; GFX7-NEXT:    ds_read_u8 v5, v0 offset:5
; GFX7-NEXT:    ds_read_u8 v6, v0 offset:6
; GFX7-NEXT:    ds_read_u8 v7, v0 offset:7
; GFX7-NEXT:    ds_read_u8 v8, v0 offset:8
; GFX7-NEXT:    ds_read_u8 v9, v0 offset:9
; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
; GFX7-NEXT:    s_waitcnt lgkmcnt(7)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT:    ds_read_u8 v0, v0 offset:11
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v3
; GFX7-NEXT:    v_or_b32_e32 v3, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(6)
; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 8, v5
; GFX7-NEXT:    v_or_b32_e32 v1, v1, v4
; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 24, v7
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v4
; GFX7-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v10
; GFX7-NEXT:    v_or_b32_e32 v2, v2, v8
; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
; GFX7-NEXT:    v_or_b32_e32 v2, v0, v2
; GFX7-NEXT:    v_mov_b32_e32 v0, v3
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: load_lds_v3i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    ds_read_u8 v1, v0
; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
; GFX10-NEXT:    ds_read_u8 v11, v0 offset:11
; GFX10-NEXT:    ds_read_u8 v0, v0 offset:10
; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
; GFX10-NEXT:    s_waitcnt lgkmcnt(9)
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
; GFX10-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
; GFX10-NEXT:    s_waitcnt lgkmcnt(5)
; GFX10-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v8
; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
; GFX10-NEXT:    v_lshl_or_b32 v7, v10, 8, v9
; GFX10-NEXT:    s_waitcnt lgkmcnt(1)
; GFX10-NEXT:    v_lshlrev_b32_e32 v8, 24, v11
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v0
; GFX10-NEXT:    v_or3_b32 v0, v2, v3, v1
; GFX10-NEXT:    v_or3_b32 v1, v5, v6, v4
; GFX10-NEXT:    v_or3_b32 v2, v8, v9, v7
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
  ret <3 x i32> %load
}

; Stores a <4 x i32> through an addrspace(3) pointer with `align 1`.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Expected code per -mcpu:
;   - gfx900 expects one ds_write2_b64 covering all 16 bytes.
;   - hawaii expects sixteen ds_write_b8, bytes extracted with v_lshrrev_b32 / v_bfe_u32.
;   - gfx1010 expects sixteen ds_write_b8, bytes extracted with v_lshrrev_b32 / v_lshrrev_b16.
define void @store_lds_v4i32_align1(<4 x i32> addrspace(3)* %out, <4 x i32> %x) {
; GFX9-LABEL: store_lds_v4i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_write2_b64 v0, v[1:2], v[3:4] offset1:1
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: store_lds_v4i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX7-NEXT:    v_bfe_u32 v6, v1, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    ds_write_b8 v0, v6 offset:1
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT:    ds_write_b8 v0, v5 offset:2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT:    v_bfe_u32 v5, v2, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX7-NEXT:    ds_write_b8 v0, v5 offset:5
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:7
; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:9
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:11
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v4
; GFX7-NEXT:    v_bfe_u32 v2, v4, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v4 offset:12
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:13
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v4
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:14
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:15
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_lds_v4i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
; GFX10-NEXT:    v_lshrrev_b16 v6, 8, v1
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX10-NEXT:    v_lshrrev_b16 v8, 8, v2
; GFX10-NEXT:    v_lshrrev_b16 v7, 8, v5
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX10-NEXT:    ds_write_b8 v0, v6 offset:1
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:2
; GFX10-NEXT:    ds_write_b8 v0, v7 offset:3
; GFX10-NEXT:    v_lshrrev_b16 v2, 8, v1
; GFX10-NEXT:    ds_write_b8 v0, v8 offset:5
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v3
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:7
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
; GFX10-NEXT:    v_lshrrev_b16 v3, 8, v1
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:9
; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v4
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v2
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:11
; GFX10-NEXT:    ds_write_b8 v0, v4 offset:12
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:13
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:14
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:15
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  store <4 x i32> %x, <4 x i32> addrspace(3)* %out, align 1
  ret void
}

; Stores a <3 x i32> through an addrspace(3) pointer with `align 1`.
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Expected code per -mcpu:
;   - gfx900 expects one ds_write_b96 covering all 12 bytes.
;   - hawaii expects twelve ds_write_b8, bytes extracted with v_lshrrev_b32 / v_bfe_u32.
;   - gfx1010 expects twelve ds_write_b8, bytes extracted with v_lshrrev_b32 / v_lshrrev_b16.
define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
; GFX9-LABEL: store_lds_v3i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    ds_write_b96 v0, v[1:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX7-LABEL: store_lds_v3i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT:    s_mov_b32 m0, -1
; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX7-NEXT:    v_bfe_u32 v5, v1, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v1
; GFX7-NEXT:    ds_write_b8 v0, v5 offset:1
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 24, v1
; GFX7-NEXT:    ds_write_b8 v0, v4 offset:2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX7-NEXT:    v_bfe_u32 v4, v2, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX7-NEXT:    ds_write_b8 v0, v4 offset:5
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:7
; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
; GFX7-NEXT:    v_bfe_u32 v2, v3, 8, 8
; GFX7-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:9
; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
; GFX7-NEXT:    ds_write_b8 v0, v2 offset:11
; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: store_lds_v3i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v1
; GFX10-NEXT:    ds_write_b8 v0, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
; GFX10-NEXT:    v_lshrrev_b16 v6, 8, v2
; GFX10-NEXT:    v_lshrrev_b16 v7, 8, v4
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:4
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:1
; GFX10-NEXT:    ds_write_b8 v0, v4 offset:2
; GFX10-NEXT:    ds_write_b8 v0, v7 offset:3
; GFX10-NEXT:    v_lshrrev_b16 v4, 8, v1
; GFX10-NEXT:    v_lshrrev_b16 v5, 8, v3
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:6
; GFX10-NEXT:    v_lshrrev_b16 v1, 8, v2
; GFX10-NEXT:    ds_write_b8 v0, v6 offset:5
; GFX10-NEXT:    ds_write_b8 v0, v4 offset:7
; GFX10-NEXT:    ds_write_b8 v0, v3 offset:8
; GFX10-NEXT:    ds_write_b8 v0, v5 offset:9
; GFX10-NEXT:    ds_write_b8 v0, v2 offset:10
; GFX10-NEXT:    ds_write_b8 v0, v1 offset:11
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
  ret void
}

; Loads a <8 x i32> through an `inreg` addrspace(4) pointer with `align 1` and
; stores it through an `inreg` addrspace(1) pointer (amdgpu_ps calling convention).
; The assertions below are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
; Every -mcpu is expected to split the access into two dwordx4 loads and two
; dwordx4 stores: global_load / global_store on gfx900 and gfx1010,
; buffer_load / buffer_store on hawaii. No s_load appears in the expected output.
; Before the second store, gfx900 and hawaii wait on vmcnt(1) whereas gfx1010
; waits on vmcnt(0).
; NOTE(review): presumably gfx1010 does not count stores in vmcnt -- confirm
; against the ISA documentation before changing these waits.
define amdgpu_ps void @test_s_load_constant_v8i32_align1(<8 x i32> addrspace(4)* inreg %ptr, <8 x i32> addrspace(1)* inreg %out) {
; GFX9-LABEL: test_s_load_constant_v8i32_align1:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_mov_b32_e32 v8, 0
; GFX9-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX9-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
; GFX9-NEXT:    s_waitcnt vmcnt(1)
; GFX9-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
; GFX9-NEXT:    s_endpgm
;
; GFX7-LABEL: test_s_load_constant_v8i32_align1:
; GFX7:       ; %bb.0:
; GFX7-NEXT:    s_mov_b32 s4, s2
; GFX7-NEXT:    s_mov_b32 s5, s3
; GFX7-NEXT:    s_mov_b32 s2, -1
; GFX7-NEXT:    s_mov_b32 s3, 0xf000
; GFX7-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX7-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX7-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX7-NEXT:    s_waitcnt vmcnt(1)
; GFX7-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX7-NEXT:    s_endpgm
;
; GFX10-LABEL: test_s_load_constant_v8i32_align1:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    v_mov_b32_e32 v8, 0
; GFX10-NEXT:    s_clause 0x1
; GFX10-NEXT:    global_load_dwordx4 v[0:3], v8, s[0:1]
; GFX10-NEXT:    global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
; GFX10-NEXT:    s_waitcnt vmcnt(1)
; GFX10-NEXT:    global_store_dwordx4 v8, v[0:3], s[2:3]
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
; GFX10-NEXT:    s_endpgm
  %load = load <8 x i32>, <8 x i32> addrspace(4)* %ptr, align 1
  store <8 x i32> %load, <8 x i32> addrspace(1)* %out
  ret void
}