1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6
; Loads a <4 x i32> through an addrspace(3) pointer with no explicit alignment
; on the load instruction.
; Expected code per the assertions below: a single ds_read_b128 for the gfx900,
; hawaii and gfx1010 runs; two ds_read_b64 for the tahiti run, with the second
; address formed by adding 8 to the base. The hawaii and tahiti runs also set
; m0 to -1 before the first DS read.
; The expected output here is the same as for @load_lds_v4i32_align16.
7define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
8; GFX9-LABEL: load_lds_v4i32:
9; GFX9:       ; %bb.0:
10; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
11; GFX9-NEXT:    ds_read_b128 v[0:3], v0
12; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
13; GFX9-NEXT:    s_setpc_b64 s[30:31]
14;
15; GFX7-LABEL: load_lds_v4i32:
16; GFX7:       ; %bb.0:
17; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
18; GFX7-NEXT:    s_mov_b32 m0, -1
19; GFX7-NEXT:    ds_read_b128 v[0:3], v0
20; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
21; GFX7-NEXT:    s_setpc_b64 s[30:31]
22;
23; GFX6-LABEL: load_lds_v4i32:
24; GFX6:       ; %bb.0:
25; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26; GFX6-NEXT:    v_mov_b32_e32 v2, v0
27; GFX6-NEXT:    s_mov_b32 m0, -1
28; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
29; GFX6-NEXT:    ds_read_b64 v[0:1], v0
30; GFX6-NEXT:    ds_read_b64 v[2:3], v2
31; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
32; GFX6-NEXT:    s_setpc_b64 s[30:31]
33;
34; GFX10-LABEL: load_lds_v4i32:
35; GFX10:       ; %bb.0:
36; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
38; GFX10-NEXT:    ds_read_b128 v[0:3], v0
39; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; No 'align' operand here; presumably the type's default alignment applies
  ; (TODO confirm against the target data layout).
41  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
42  ret <4 x i32> %load
43}
44
; Loads a <4 x i32> through an addrspace(3) pointer with the load marked
; 'align 1'.
; Expected code per the assertions below: every run reads the 16 bytes
; individually with ds_read_u8 and rebuilds the four dwords with shifts and
; ors. The gfx900 and gfx1010 runs use v_lshl_or_b32 and the instruction offset
; field (offset:1 .. offset:15). The hawaii run uses v_lshlrev_b32 + v_or_b32
; with the offset field. The tahiti run uses v_lshlrev_b32 + v_or_b32 and
; computes each byte address with a separate v_add_i32.
; The s_waitcnt lgkmcnt(N) values between the combines are part of the checked
; output, so any scheduling change in the backend shows up in this test.
45define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
46; GFX9-LABEL: load_lds_v4i32_align1:
47; GFX9:       ; %bb.0:
48; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49; GFX9-NEXT:    ds_read_u8 v1, v0
50; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
51; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
52; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
53; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
54; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
55; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
56; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
57; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
58; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
59; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
60; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
61; GFX9-NEXT:    ds_read_u8 v13, v0 offset:12
62; GFX9-NEXT:    ds_read_u8 v14, v0 offset:13
63; GFX9-NEXT:    ds_read_u8 v15, v0 offset:14
64; GFX9-NEXT:    ds_read_u8 v16, v0 offset:15
65; GFX9-NEXT:    s_waitcnt lgkmcnt(14)
66; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
67; GFX9-NEXT:    s_waitcnt lgkmcnt(12)
68; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
69; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
70; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
71; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
72; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
73; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
74; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
75; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
76; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
77; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
78; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
79; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
80; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
81; GFX9-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
82; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX9-NEXT:    v_lshl_or_b32 v4, v16, 8, v15
84; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
85; GFX9-NEXT:    s_setpc_b64 s[30:31]
86;
87; GFX7-LABEL: load_lds_v4i32_align1:
88; GFX7:       ; %bb.0:
89; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90; GFX7-NEXT:    s_mov_b32 m0, -1
91; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
92; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
93; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
94; GFX7-NEXT:    ds_read_u8 v4, v0 offset:1
95; GFX7-NEXT:    ds_read_u8 v5, v0
96; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
97; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
98; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
99; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
100; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
101; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
102; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
103; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
104; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
105; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
106; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
107; GFX7-NEXT:    v_or_b32_e32 v4, v3, v4
108; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
109; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
110; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
111; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
112; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
113; GFX7-NEXT:    ds_read_u8 v5, v0 offset:15
114; GFX7-NEXT:    ds_read_u8 v6, v0 offset:14
115; GFX7-NEXT:    ds_read_u8 v7, v0 offset:13
116; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
117; GFX7-NEXT:    ds_read_u8 v9, v0 offset:11
118; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
119; GFX7-NEXT:    ds_read_u8 v11, v0 offset:9
120; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
121; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
122; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
123; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
124; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
125; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v11
126; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
128; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
129; GFX7-NEXT:    v_or_b32_e32 v2, v2, v10
130; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
131; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
132; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
133; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
134; GFX7-NEXT:    v_or_b32_e32 v3, v3, v6
135; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
136; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
137; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
138; GFX7-NEXT:    v_mov_b32_e32 v0, v4
139; GFX7-NEXT:    s_setpc_b64 s[30:31]
140;
141; GFX6-LABEL: load_lds_v4i32_align1:
142; GFX6:       ; %bb.0:
143; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
144; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
145; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
146; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
147; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
148; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
149; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
150; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
151; GFX6-NEXT:    s_mov_b32 m0, -1
152; GFX6-NEXT:    ds_read_u8 v1, v1
153; GFX6-NEXT:    ds_read_u8 v2, v2
154; GFX6-NEXT:    ds_read_u8 v3, v3
155; GFX6-NEXT:    ds_read_u8 v4, v4
156; GFX6-NEXT:    ds_read_u8 v5, v5
157; GFX6-NEXT:    ds_read_u8 v6, v6
158; GFX6-NEXT:    ds_read_u8 v7, v7
159; GFX6-NEXT:    ds_read_u8 v8, v0
160; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
161; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
162; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
163; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
164; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
165; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
166; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
167; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
168; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
169; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
170; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
171; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
172; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
173; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
174; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
175; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
176; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
177; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 13, v0
178; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
179; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 15, v0
180; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 14, v0
181; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 3, v0
182; GFX6-NEXT:    v_add_i32_e32 v11, vcc, 2, v0
183; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
184; GFX6-NEXT:    ds_read_u8 v4, v4
185; GFX6-NEXT:    ds_read_u8 v5, v5
186; GFX6-NEXT:    ds_read_u8 v6, v6
187; GFX6-NEXT:    ds_read_u8 v7, v7
188; GFX6-NEXT:    ds_read_u8 v9, v9
189; GFX6-NEXT:    ds_read_u8 v10, v10
190; GFX6-NEXT:    ds_read_u8 v11, v11
191; GFX6-NEXT:    ds_read_u8 v0, v0
192; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
193; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
194; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
195; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
196; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
197; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
198; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
199; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
200; GFX6-NEXT:    v_or_b32_e32 v4, v4, v9
201; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
202; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
203; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
204; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
205; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
206; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
207; GFX6-NEXT:    v_or_b32_e32 v4, v4, v11
208; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
209; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
210; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
211; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
212; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
213; GFX6-NEXT:    s_setpc_b64 s[30:31]
214;
215; GFX10-LABEL: load_lds_v4i32_align1:
216; GFX10:       ; %bb.0:
217; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
218; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
219; GFX10-NEXT:    ds_read_u8 v1, v0
220; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
221; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
222; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
223; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
224; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
225; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
226; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
227; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
228; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
229; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
230; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
231; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
232; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
233; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
234; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
235; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
236; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
237; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
238; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
239; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
240; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
241; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
242; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
243; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
244; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
245; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
246; GFX10-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
247; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
248; GFX10-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
249; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
250; GFX10-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
251; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
252; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
253; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
254; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
255; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; 'align 1' is the property under test; all four runs expect byte reads.
256  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
257  ret <4 x i32> %load
258}
259
; Loads a <4 x i32> through an addrspace(3) pointer with the load marked
; 'align 2'.
; Expected code per the assertions below: every run reads eight 16-bit halves
; with ds_read_u16 and joins each pair with a 16-bit shift and or. The gfx900
; and gfx1010 runs use v_lshl_or_b32. The hawaii and tahiti runs use
; v_lshlrev_b32 + v_or_b32. The tahiti run computes each address with
; v_add_i32, while the other three runs use the instruction offset field.
260define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
261; GFX9-LABEL: load_lds_v4i32_align2:
262; GFX9:       ; %bb.0:
263; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
264; GFX9-NEXT:    ds_read_u16 v1, v0
265; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
266; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
267; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
268; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
269; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
270; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
271; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
272; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
273; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
274; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
275; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
276; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
277; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
278; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
279; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
280; GFX9-NEXT:    s_setpc_b64 s[30:31]
281;
282; GFX7-LABEL: load_lds_v4i32_align2:
283; GFX7:       ; %bb.0:
284; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
285; GFX7-NEXT:    s_mov_b32 m0, -1
286; GFX7-NEXT:    ds_read_u16 v3, v0 offset:12
287; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
288; GFX7-NEXT:    ds_read_u16 v1, v0 offset:4
289; GFX7-NEXT:    ds_read_u16 v4, v0 offset:2
290; GFX7-NEXT:    ds_read_u16 v5, v0
291; GFX7-NEXT:    ds_read_u16 v6, v0 offset:6
292; GFX7-NEXT:    ds_read_u16 v7, v0 offset:10
293; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
294; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
295; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
296; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
297; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
298; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
299; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
300; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
301; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
302; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
303; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
304; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
305; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
306; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
307; GFX7-NEXT:    s_setpc_b64 s[30:31]
308;
309; GFX6-LABEL: load_lds_v4i32_align2:
310; GFX6:       ; %bb.0:
311; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
312; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
313; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
314; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
315; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
316; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 14, v0
317; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
318; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 2, v0
319; GFX6-NEXT:    s_mov_b32 m0, -1
320; GFX6-NEXT:    ds_read_u16 v1, v1
321; GFX6-NEXT:    ds_read_u16 v2, v2
322; GFX6-NEXT:    ds_read_u16 v3, v3
323; GFX6-NEXT:    ds_read_u16 v4, v4
324; GFX6-NEXT:    ds_read_u16 v5, v5
325; GFX6-NEXT:    ds_read_u16 v6, v6
326; GFX6-NEXT:    ds_read_u16 v7, v7
327; GFX6-NEXT:    ds_read_u16 v0, v0
328; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
329; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
330; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
331; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
332; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
333; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
334; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
335; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
336; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
337; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
338; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
339; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
340; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
341; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
342; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
343; GFX6-NEXT:    s_setpc_b64 s[30:31]
344;
345; GFX10-LABEL: load_lds_v4i32_align2:
346; GFX10:       ; %bb.0:
347; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
349; GFX10-NEXT:    ds_read_u16 v1, v0
350; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
351; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
352; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
353; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
354; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
355; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
356; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
357; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
358; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
359; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
360; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
361; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
362; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
363; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
364; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
365; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; 'align 2' is the property under test; all four runs expect 16-bit reads.
366  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
367  ret <4 x i32> %load
368}
369
; Loads a <4 x i32> through an addrspace(3) pointer with the load marked
; 'align 4'.
; Expected code per the assertions below: the gfx900, hawaii and gfx1010 runs
; use two ds_read2_b32 instructions (offset1:1, then offset0:2 offset1:3),
; with the base address copied into v2 for the second read. The tahiti run
; uses four ds_read_b32 instructions, three of them on addresses formed with
; v_add_i32 (+4, +8, +12).
370define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
371; GFX9-LABEL: load_lds_v4i32_align4:
372; GFX9:       ; %bb.0:
373; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
374; GFX9-NEXT:    v_mov_b32_e32 v2, v0
375; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
376; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
377; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
378; GFX9-NEXT:    s_setpc_b64 s[30:31]
379;
380; GFX7-LABEL: load_lds_v4i32_align4:
381; GFX7:       ; %bb.0:
382; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
383; GFX7-NEXT:    v_mov_b32_e32 v2, v0
384; GFX7-NEXT:    s_mov_b32 m0, -1
385; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
386; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
387; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
388; GFX7-NEXT:    s_setpc_b64 s[30:31]
389;
390; GFX6-LABEL: load_lds_v4i32_align4:
391; GFX6:       ; %bb.0:
392; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
393; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
394; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
395; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
396; GFX6-NEXT:    s_mov_b32 m0, -1
397; GFX6-NEXT:    ds_read_b32 v2, v2
398; GFX6-NEXT:    ds_read_b32 v3, v3
399; GFX6-NEXT:    ds_read_b32 v0, v0
400; GFX6-NEXT:    ds_read_b32 v1, v1
401; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
402; GFX6-NEXT:    s_setpc_b64 s[30:31]
403;
404; GFX10-LABEL: load_lds_v4i32_align4:
405; GFX10:       ; %bb.0:
406; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
407; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
408; GFX10-NEXT:    v_mov_b32_e32 v2, v0
409; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
410; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
411; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
412; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; 'align 4' is the property under test; all four runs expect 32-bit reads.
413  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
414  ret <4 x i32> %load
415}
416
; Loads a <4 x i32> through an addrspace(3) pointer with the load marked
; 'align 8'.
; Expected code per the assertions below: the gfx900, hawaii and gfx1010 runs
; use one ds_read2_b64 with offset1:1 that fills v[0:3]. The tahiti run uses
; two ds_read_b64 instructions, the second on base + 8 computed with v_add_i32.
417define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
418; GFX9-LABEL: load_lds_v4i32_align8:
419; GFX9:       ; %bb.0:
420; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
421; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
422; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX9-NEXT:    s_setpc_b64 s[30:31]
424;
425; GFX7-LABEL: load_lds_v4i32_align8:
426; GFX7:       ; %bb.0:
427; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
428; GFX7-NEXT:    s_mov_b32 m0, -1
429; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
430; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
431; GFX7-NEXT:    s_setpc_b64 s[30:31]
432;
433; GFX6-LABEL: load_lds_v4i32_align8:
434; GFX6:       ; %bb.0:
435; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
436; GFX6-NEXT:    v_mov_b32_e32 v2, v0
437; GFX6-NEXT:    s_mov_b32 m0, -1
438; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
439; GFX6-NEXT:    ds_read_b64 v[0:1], v0
440; GFX6-NEXT:    ds_read_b64 v[2:3], v2
441; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
442; GFX6-NEXT:    s_setpc_b64 s[30:31]
443;
444; GFX10-LABEL: load_lds_v4i32_align8:
445; GFX10:       ; %bb.0:
446; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
447; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
448; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
449; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; 'align 8' is the property under test; all four runs expect 64-bit reads.
451  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
452  ret <4 x i32> %load
453}
454
; Loads a <4 x i32> through an addrspace(3) pointer with the load marked
; 'align 16'.
; Expected code per the assertions below: the gfx900, hawaii and gfx1010 runs
; use a single ds_read_b128 into v[0:3]. The tahiti run uses two ds_read_b64
; instructions, the second on base + 8 computed with v_add_i32; that is the
; same sequence the tahiti run expects for @load_lds_v4i32_align8.
455define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
456; GFX9-LABEL: load_lds_v4i32_align16:
457; GFX9:       ; %bb.0:
458; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
459; GFX9-NEXT:    ds_read_b128 v[0:3], v0
460; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
461; GFX9-NEXT:    s_setpc_b64 s[30:31]
462;
463; GFX7-LABEL: load_lds_v4i32_align16:
464; GFX7:       ; %bb.0:
465; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
466; GFX7-NEXT:    s_mov_b32 m0, -1
467; GFX7-NEXT:    ds_read_b128 v[0:3], v0
468; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
469; GFX7-NEXT:    s_setpc_b64 s[30:31]
470;
471; GFX6-LABEL: load_lds_v4i32_align16:
472; GFX6:       ; %bb.0:
473; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
474; GFX6-NEXT:    v_mov_b32_e32 v2, v0
475; GFX6-NEXT:    s_mov_b32 m0, -1
476; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
477; GFX6-NEXT:    ds_read_b64 v[0:1], v0
478; GFX6-NEXT:    ds_read_b64 v[2:3], v2
479; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX6-NEXT:    s_setpc_b64 s[30:31]
481;
482; GFX10-LABEL: load_lds_v4i32_align16:
483; GFX10:       ; %bb.0:
484; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
485; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
486; GFX10-NEXT:    ds_read_b128 v[0:3], v0
487; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; 'align 16' is the property under test.
489  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
490  ret <4 x i32> %load
491}
492