1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
7
; Load a <4 x i32> through an addrspace(3) pointer with no explicit `align` on
; the load, and return the loaded vector.
; The autogenerated assertions below expect a single 128-bit DS read
; (ds_read_b128, spelled ds_load_b128 for gfx1100) on gfx900, hawaii, gfx1010
; and gfx1100, and two 64-bit reads (ds_read_b64, second address = base + 8)
; on tahiti. These are the same sequences asserted for the explicit `align 16`
; variant of this test. The hawaii and tahiti sequences also set m0 to -1
; before the DS reads.
8define <4 x i32> @load_lds_v4i32(<4 x i32> addrspace(3)* %ptr) {
9; GFX9-LABEL: load_lds_v4i32:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX9-NEXT:    ds_read_b128 v[0:3], v0
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    s_setpc_b64 s[30:31]
15;
16; GFX7-LABEL: load_lds_v4i32:
17; GFX7:       ; %bb.0:
18; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GFX7-NEXT:    s_mov_b32 m0, -1
20; GFX7-NEXT:    ds_read_b128 v[0:3], v0
21; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22; GFX7-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX6-LABEL: load_lds_v4i32:
25; GFX6:       ; %bb.0:
26; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX6-NEXT:    v_mov_b32_e32 v2, v0
28; GFX6-NEXT:    s_mov_b32 m0, -1
29; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
30; GFX6-NEXT:    ds_read_b64 v[0:1], v0
31; GFX6-NEXT:    ds_read_b64 v[2:3], v2
32; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX6-NEXT:    s_setpc_b64 s[30:31]
34;
35; GFX10-LABEL: load_lds_v4i32:
36; GFX10:       ; %bb.0:
37; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
39; GFX10-NEXT:    ds_read_b128 v[0:3], v0
40; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX10-NEXT:    s_setpc_b64 s[30:31]
42;
43; GFX11-LABEL: load_lds_v4i32:
44; GFX11:       ; %bb.0:
45; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
47; GFX11-NEXT:    ds_load_b128 v[0:3], v0
48; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX11-NEXT:    s_setpc_b64 s[30:31]
50  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr
51  ret <4 x i32> %load
52}
53
; Load a <4 x i32> through an addrspace(3) pointer with `align 1` and return
; the loaded vector.
; The autogenerated assertions below expect the access to be split into
; sixteen single-byte DS reads (ds_read_u8, spelled ds_load_u8 for gfx1100).
; The bytes are recombined into the four result dwords with v_lshl_or_b32 on
; gfx900, gfx1010 and gfx1100, and with separate v_lshlrev_b32 / v_or_b32
; pairs on hawaii and tahiti. On tahiti every byte address other than the base
; is formed with its own v_add_i32 instead of an `offset` operand on the read.
54define <4 x i32> @load_lds_v4i32_align1(<4 x i32> addrspace(3)* %ptr) {
55; GFX9-LABEL: load_lds_v4i32_align1:
56; GFX9:       ; %bb.0:
57; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
58; GFX9-NEXT:    ds_read_u8 v1, v0
59; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
60; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
61; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
62; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
63; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
64; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
65; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
66; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
67; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
68; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
69; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
70; GFX9-NEXT:    ds_read_u8 v13, v0 offset:12
71; GFX9-NEXT:    ds_read_u8 v14, v0 offset:13
72; GFX9-NEXT:    ds_read_u8 v15, v0 offset:14
73; GFX9-NEXT:    ds_read_u8 v16, v0 offset:15
74; GFX9-NEXT:    s_waitcnt lgkmcnt(14)
75; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
76; GFX9-NEXT:    s_waitcnt lgkmcnt(12)
77; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
78; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
79; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
80; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
81; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
82; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
83; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
84; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
85; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
86; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
87; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
88; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
89; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
90; GFX9-NEXT:    v_lshl_or_b32 v3, v14, 8, v13
91; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
92; GFX9-NEXT:    v_lshl_or_b32 v4, v16, 8, v15
93; GFX9-NEXT:    v_lshl_or_b32 v3, v4, 16, v3
94; GFX9-NEXT:    s_setpc_b64 s[30:31]
95;
96; GFX7-LABEL: load_lds_v4i32_align1:
97; GFX7:       ; %bb.0:
98; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
99; GFX7-NEXT:    s_mov_b32 m0, -1
100; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
101; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
102; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
103; GFX7-NEXT:    ds_read_u8 v4, v0 offset:1
104; GFX7-NEXT:    ds_read_u8 v5, v0
105; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
106; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
107; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
108; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
109; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
110; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
111; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
112; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
113; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
114; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
115; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
116; GFX7-NEXT:    v_or_b32_e32 v4, v3, v4
117; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
118; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
119; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
120; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v8
122; GFX7-NEXT:    ds_read_u8 v5, v0 offset:15
123; GFX7-NEXT:    ds_read_u8 v6, v0 offset:14
124; GFX7-NEXT:    ds_read_u8 v7, v0 offset:13
125; GFX7-NEXT:    ds_read_u8 v8, v0 offset:12
126; GFX7-NEXT:    ds_read_u8 v9, v0 offset:11
127; GFX7-NEXT:    ds_read_u8 v10, v0 offset:10
128; GFX7-NEXT:    ds_read_u8 v11, v0 offset:9
129; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
130; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
131; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
132; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
133; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
134; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v11
135; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
137; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v9
138; GFX7-NEXT:    v_or_b32_e32 v2, v2, v10
139; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
140; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
141; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
142; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 8, v7
143; GFX7-NEXT:    v_or_b32_e32 v3, v3, v6
144; GFX7-NEXT:    v_or_b32_e32 v0, v0, v8
145; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
146; GFX7-NEXT:    v_or_b32_e32 v3, v3, v0
147; GFX7-NEXT:    v_mov_b32_e32 v0, v4
148; GFX7-NEXT:    s_setpc_b64 s[30:31]
149;
150; GFX6-LABEL: load_lds_v4i32_align1:
151; GFX6:       ; %bb.0:
152; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
153; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
154; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
155; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
156; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
157; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
158; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
159; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
160; GFX6-NEXT:    s_mov_b32 m0, -1
161; GFX6-NEXT:    ds_read_u8 v1, v1
162; GFX6-NEXT:    ds_read_u8 v2, v2
163; GFX6-NEXT:    ds_read_u8 v3, v3
164; GFX6-NEXT:    ds_read_u8 v4, v4
165; GFX6-NEXT:    ds_read_u8 v5, v5
166; GFX6-NEXT:    ds_read_u8 v6, v6
167; GFX6-NEXT:    ds_read_u8 v7, v7
168; GFX6-NEXT:    ds_read_u8 v8, v0
169; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
170; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
171; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
172; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
173; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
174; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
175; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
176; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
177; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
178; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
179; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
180; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
181; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
182; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
183; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
184; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
185; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
186; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 13, v0
187; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
188; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 15, v0
189; GFX6-NEXT:    v_add_i32_e32 v9, vcc, 14, v0
190; GFX6-NEXT:    v_add_i32_e32 v10, vcc, 3, v0
191; GFX6-NEXT:    v_add_i32_e32 v11, vcc, 2, v0
192; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
193; GFX6-NEXT:    ds_read_u8 v4, v4
194; GFX6-NEXT:    ds_read_u8 v5, v5
195; GFX6-NEXT:    ds_read_u8 v6, v6
196; GFX6-NEXT:    ds_read_u8 v7, v7
197; GFX6-NEXT:    ds_read_u8 v9, v9
198; GFX6-NEXT:    ds_read_u8 v10, v10
199; GFX6-NEXT:    ds_read_u8 v11, v11
200; GFX6-NEXT:    ds_read_u8 v0, v0
201; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
202; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
203; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
204; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
205; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
206; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
207; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
208; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
209; GFX6-NEXT:    v_or_b32_e32 v4, v4, v9
210; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
211; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
212; GFX6-NEXT:    v_or_b32_e32 v3, v4, v3
213; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
214; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 8, v10
215; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
216; GFX6-NEXT:    v_or_b32_e32 v4, v4, v11
217; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
219; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
220; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
221; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
222; GFX6-NEXT:    s_setpc_b64 s[30:31]
223;
224; GFX10-LABEL: load_lds_v4i32_align1:
225; GFX10:       ; %bb.0:
226; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
227; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
228; GFX10-NEXT:    ds_read_u8 v1, v0
229; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
230; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
231; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
232; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
233; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
234; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
235; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
236; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
237; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
238; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
239; GFX10-NEXT:    ds_read_u8 v12, v0 offset:11
240; GFX10-NEXT:    ds_read_u8 v13, v0 offset:12
241; GFX10-NEXT:    ds_read_u8 v14, v0 offset:13
242; GFX10-NEXT:    ds_read_u8 v15, v0 offset:14
243; GFX10-NEXT:    ds_read_u8 v0, v0 offset:15
244; GFX10-NEXT:    s_waitcnt lgkmcnt(14)
245; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
246; GFX10-NEXT:    s_waitcnt lgkmcnt(12)
247; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
248; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
249; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
250; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
251; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
252; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
253; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
254; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
255; GFX10-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
256; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
257; GFX10-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
258; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
259; GFX10-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
260; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
261; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
262; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
263; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
264; GFX10-NEXT:    s_setpc_b64 s[30:31]
265;
266; GFX11-LABEL: load_lds_v4i32_align1:
267; GFX11:       ; %bb.0:
268; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
269; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
270; GFX11-NEXT:    ds_load_u8 v1, v0
271; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
272; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
273; GFX11-NEXT:    ds_load_u8 v4, v0 offset:3
274; GFX11-NEXT:    ds_load_u8 v5, v0 offset:4
275; GFX11-NEXT:    ds_load_u8 v6, v0 offset:5
276; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
277; GFX11-NEXT:    ds_load_u8 v8, v0 offset:7
278; GFX11-NEXT:    ds_load_u8 v9, v0 offset:8
279; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
280; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
281; GFX11-NEXT:    ds_load_u8 v12, v0 offset:11
282; GFX11-NEXT:    ds_load_u8 v13, v0 offset:12
283; GFX11-NEXT:    ds_load_u8 v14, v0 offset:13
284; GFX11-NEXT:    ds_load_u8 v15, v0 offset:14
285; GFX11-NEXT:    ds_load_u8 v0, v0 offset:15
286; GFX11-NEXT:    s_waitcnt lgkmcnt(14)
287; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
288; GFX11-NEXT:    s_waitcnt lgkmcnt(12)
289; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
290; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
291; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
292; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
293; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
294; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
295; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
296; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
297; GFX11-NEXT:    v_lshl_or_b32 v6, v12, 8, v11
298; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
299; GFX11-NEXT:    v_lshl_or_b32 v7, v14, 8, v13
300; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX11-NEXT:    v_lshl_or_b32 v8, v0, 8, v15
302; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
303; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
304; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
305; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4)
306; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
307; GFX11-NEXT:    s_setpc_b64 s[30:31]
308  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 1
309  ret <4 x i32> %load
310}
311
; Load a <4 x i32> through an addrspace(3) pointer with `align 2` and return
; the loaded vector.
; The autogenerated assertions below expect the access to be split into eight
; 16-bit DS reads (ds_read_u16, spelled ds_load_u16 for gfx1100). Each pair of
; halves is merged into one result dword with v_lshl_or_b32 on gfx900, gfx1010
; and gfx1100, and with v_lshlrev_b32 + v_or_b32 on hawaii and tahiti. On
; tahiti the seven non-base addresses are formed with v_add_i32 instead of an
; `offset` operand on the read.
312define <4 x i32> @load_lds_v4i32_align2(<4 x i32> addrspace(3)* %ptr) {
313; GFX9-LABEL: load_lds_v4i32_align2:
314; GFX9:       ; %bb.0:
315; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
316; GFX9-NEXT:    ds_read_u16 v1, v0
317; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
318; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
319; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
320; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
321; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
322; GFX9-NEXT:    ds_read_u16 v7, v0 offset:12
323; GFX9-NEXT:    ds_read_u16 v8, v0 offset:14
324; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
325; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
326; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
327; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
328; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
329; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
330; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX9-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
332; GFX9-NEXT:    s_setpc_b64 s[30:31]
333;
334; GFX7-LABEL: load_lds_v4i32_align2:
335; GFX7:       ; %bb.0:
336; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
337; GFX7-NEXT:    s_mov_b32 m0, -1
338; GFX7-NEXT:    ds_read_u16 v3, v0 offset:12
339; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
340; GFX7-NEXT:    ds_read_u16 v1, v0 offset:4
341; GFX7-NEXT:    ds_read_u16 v4, v0 offset:2
342; GFX7-NEXT:    ds_read_u16 v5, v0
343; GFX7-NEXT:    ds_read_u16 v6, v0 offset:6
344; GFX7-NEXT:    ds_read_u16 v7, v0 offset:10
345; GFX7-NEXT:    ds_read_u16 v8, v0 offset:14
346; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
347; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v4
348; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
349; GFX7-NEXT:    v_or_b32_e32 v0, v0, v5
350; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
351; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
352; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
353; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
354; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
355; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
356; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
358; GFX7-NEXT:    v_or_b32_e32 v3, v4, v3
359; GFX7-NEXT:    s_setpc_b64 s[30:31]
360;
361; GFX6-LABEL: load_lds_v4i32_align2:
362; GFX6:       ; %bb.0:
363; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
364; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
365; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
366; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
367; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
368; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 14, v0
369; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 12, v0
370; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 2, v0
371; GFX6-NEXT:    s_mov_b32 m0, -1
372; GFX6-NEXT:    ds_read_u16 v1, v1
373; GFX6-NEXT:    ds_read_u16 v2, v2
374; GFX6-NEXT:    ds_read_u16 v3, v3
375; GFX6-NEXT:    ds_read_u16 v4, v4
376; GFX6-NEXT:    ds_read_u16 v5, v5
377; GFX6-NEXT:    ds_read_u16 v6, v6
378; GFX6-NEXT:    ds_read_u16 v7, v7
379; GFX6-NEXT:    ds_read_u16 v0, v0
380; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
381; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
382; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
383; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
384; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
385; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
386; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
387; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
388; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
389; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
390; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
391; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
392; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
393; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
394; GFX6-NEXT:    v_or_b32_e32 v0, v4, v0
395; GFX6-NEXT:    s_setpc_b64 s[30:31]
396;
397; GFX10-LABEL: load_lds_v4i32_align2:
398; GFX10:       ; %bb.0:
399; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
400; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
401; GFX10-NEXT:    ds_read_u16 v1, v0
402; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
403; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
404; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
405; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
406; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
407; GFX10-NEXT:    ds_read_u16 v7, v0 offset:12
408; GFX10-NEXT:    ds_read_u16 v8, v0 offset:14
409; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
410; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
411; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
412; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
413; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
414; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
415; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
416; GFX10-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
417; GFX10-NEXT:    s_setpc_b64 s[30:31]
418;
419; GFX11-LABEL: load_lds_v4i32_align2:
420; GFX11:       ; %bb.0:
421; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
422; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
423; GFX11-NEXT:    ds_load_u16 v1, v0
424; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
425; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
426; GFX11-NEXT:    ds_load_u16 v4, v0 offset:6
427; GFX11-NEXT:    ds_load_u16 v5, v0 offset:8
428; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
429; GFX11-NEXT:    ds_load_u16 v7, v0 offset:12
430; GFX11-NEXT:    ds_load_u16 v8, v0 offset:14
431; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
432; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
433; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
434; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
435; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
436; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
437; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
438; GFX11-NEXT:    v_lshl_or_b32 v3, v8, 16, v7
439; GFX11-NEXT:    s_setpc_b64 s[30:31]
440  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 2
441  ret <4 x i32> %load
442}
443
; Load a <4 x i32> through an addrspace(3) pointer with `align 4` and return
; the loaded vector.
; The autogenerated assertions below expect two paired 32-bit DS reads
; (ds_read2_b32, spelled ds_load_2addr_b32 for gfx1100) on gfx900, hawaii,
; gfx1010 and gfx1100. The base address is first copied to v2 because the
; first read overwrites v0 with result data. On tahiti the assertions expect
; four separate ds_read_b32 instructions, with the +4, +8 and +12 addresses
; formed by v_add_i32.
444define <4 x i32> @load_lds_v4i32_align4(<4 x i32> addrspace(3)* %ptr) {
445; GFX9-LABEL: load_lds_v4i32_align4:
446; GFX9:       ; %bb.0:
447; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
448; GFX9-NEXT:    v_mov_b32_e32 v2, v0
449; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
450; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
451; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
452; GFX9-NEXT:    s_setpc_b64 s[30:31]
453;
454; GFX7-LABEL: load_lds_v4i32_align4:
455; GFX7:       ; %bb.0:
456; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
457; GFX7-NEXT:    v_mov_b32_e32 v2, v0
458; GFX7-NEXT:    s_mov_b32 m0, -1
459; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
460; GFX7-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
461; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
462; GFX7-NEXT:    s_setpc_b64 s[30:31]
463;
464; GFX6-LABEL: load_lds_v4i32_align4:
465; GFX6:       ; %bb.0:
466; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
467; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
468; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
469; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 12, v0
470; GFX6-NEXT:    s_mov_b32 m0, -1
471; GFX6-NEXT:    ds_read_b32 v2, v2
472; GFX6-NEXT:    ds_read_b32 v3, v3
473; GFX6-NEXT:    ds_read_b32 v0, v0
474; GFX6-NEXT:    ds_read_b32 v1, v1
475; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
476; GFX6-NEXT:    s_setpc_b64 s[30:31]
477;
478; GFX10-LABEL: load_lds_v4i32_align4:
479; GFX10:       ; %bb.0:
480; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
481; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
482; GFX10-NEXT:    v_mov_b32_e32 v2, v0
483; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
484; GFX10-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
485; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
486; GFX10-NEXT:    s_setpc_b64 s[30:31]
487;
488; GFX11-LABEL: load_lds_v4i32_align4:
489; GFX11:       ; %bb.0:
490; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
492; GFX11-NEXT:    v_mov_b32_e32 v2, v0
493; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
494; GFX11-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:2 offset1:3
495; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX11-NEXT:    s_setpc_b64 s[30:31]
497  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
498  ret <4 x i32> %load
499}
500
; Load a <4 x i32> through an addrspace(3) pointer with `align 8` and return
; the loaded vector.
; The autogenerated assertions below expect a single paired 64-bit DS read
; (ds_read2_b64 with offset1:1, spelled ds_load_2addr_b64 for gfx1100) on
; gfx900, hawaii, gfx1010 and gfx1100. On tahiti they expect two ds_read_b64
; instructions, the second reading from base + 8 (address formed by
; v_mov_b32 + v_add_i32).
501define <4 x i32> @load_lds_v4i32_align8(<4 x i32> addrspace(3)* %ptr) {
502; GFX9-LABEL: load_lds_v4i32_align8:
503; GFX9:       ; %bb.0:
504; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
505; GFX9-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
506; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
507; GFX9-NEXT:    s_setpc_b64 s[30:31]
508;
509; GFX7-LABEL: load_lds_v4i32_align8:
510; GFX7:       ; %bb.0:
511; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
512; GFX7-NEXT:    s_mov_b32 m0, -1
513; GFX7-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
514; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
515; GFX7-NEXT:    s_setpc_b64 s[30:31]
516;
517; GFX6-LABEL: load_lds_v4i32_align8:
518; GFX6:       ; %bb.0:
519; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
520; GFX6-NEXT:    v_mov_b32_e32 v2, v0
521; GFX6-NEXT:    s_mov_b32 m0, -1
522; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
523; GFX6-NEXT:    ds_read_b64 v[0:1], v0
524; GFX6-NEXT:    ds_read_b64 v[2:3], v2
525; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
526; GFX6-NEXT:    s_setpc_b64 s[30:31]
527;
528; GFX10-LABEL: load_lds_v4i32_align8:
529; GFX10:       ; %bb.0:
530; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
531; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
532; GFX10-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
533; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
534; GFX10-NEXT:    s_setpc_b64 s[30:31]
535;
536; GFX11-LABEL: load_lds_v4i32_align8:
537; GFX11:       ; %bb.0:
538; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
539; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
540; GFX11-NEXT:    ds_load_2addr_b64 v[0:3], v0 offset1:1
541; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX11-NEXT:    s_setpc_b64 s[30:31]
543  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
544  ret <4 x i32> %load
545}
546
; Load a <4 x i32> through an addrspace(3) pointer with `align 16` and return
; the loaded vector.
; The autogenerated assertions below expect a single 128-bit DS read
; (ds_read_b128, spelled ds_load_b128 for gfx1100) on gfx900, hawaii, gfx1010
; and gfx1100. On tahiti they expect two ds_read_b64 instructions, the second
; reading from base + 8 (address formed by v_mov_b32 + v_add_i32).
547define <4 x i32> @load_lds_v4i32_align16(<4 x i32> addrspace(3)* %ptr) {
548; GFX9-LABEL: load_lds_v4i32_align16:
549; GFX9:       ; %bb.0:
550; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
551; GFX9-NEXT:    ds_read_b128 v[0:3], v0
552; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
553; GFX9-NEXT:    s_setpc_b64 s[30:31]
554;
555; GFX7-LABEL: load_lds_v4i32_align16:
556; GFX7:       ; %bb.0:
557; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
558; GFX7-NEXT:    s_mov_b32 m0, -1
559; GFX7-NEXT:    ds_read_b128 v[0:3], v0
560; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
561; GFX7-NEXT:    s_setpc_b64 s[30:31]
562;
563; GFX6-LABEL: load_lds_v4i32_align16:
564; GFX6:       ; %bb.0:
565; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
566; GFX6-NEXT:    v_mov_b32_e32 v2, v0
567; GFX6-NEXT:    s_mov_b32 m0, -1
568; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
569; GFX6-NEXT:    ds_read_b64 v[0:1], v0
570; GFX6-NEXT:    ds_read_b64 v[2:3], v2
571; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
572; GFX6-NEXT:    s_setpc_b64 s[30:31]
573;
574; GFX10-LABEL: load_lds_v4i32_align16:
575; GFX10:       ; %bb.0:
576; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
577; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
578; GFX10-NEXT:    ds_read_b128 v[0:3], v0
579; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX10-NEXT:    s_setpc_b64 s[30:31]
581;
582; GFX11-LABEL: load_lds_v4i32_align16:
583; GFX11:       ; %bb.0:
584; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
585; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
586; GFX11-NEXT:    ds_load_b128 v[0:3], v0
587; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
588; GFX11-NEXT:    s_setpc_b64 s[30:31]
589  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 16
590  ret <4 x i32> %load
591}
592