1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
7
; Loads a <3 x i32> through an addrspace(3) pointer with no explicit alignment
; on the load instruction, and returns the loaded vector.
; The expected code matches load_lds_v3i32_align16: one ds_read_b96 on
; GFX9/GFX7/GFX10, one ds_load_b96 on GFX11, and a ds_read_b64 + ds_read_b32
; pair on GFX6.
; The "; GFXn-" lines are FileCheck assertions; the file header says they are
; autogenerated, so regenerate them with the script instead of hand-editing.
8define <3 x i32> @load_lds_v3i32(<3 x i32> addrspace(3)* %ptr) {
9; GFX9-LABEL: load_lds_v3i32:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX9-NEXT:    ds_read_b96 v[0:2], v0
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    s_setpc_b64 s[30:31]
15;
16; GFX7-LABEL: load_lds_v3i32:
17; GFX7:       ; %bb.0:
18; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19; GFX7-NEXT:    s_mov_b32 m0, -1
20; GFX7-NEXT:    ds_read_b96 v[0:2], v0
21; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
22; GFX7-NEXT:    s_setpc_b64 s[30:31]
23;
24; GFX6-LABEL: load_lds_v3i32:
25; GFX6:       ; %bb.0:
26; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
27; GFX6-NEXT:    v_mov_b32_e32 v2, v0
28; GFX6-NEXT:    s_mov_b32 m0, -1
29; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
30; GFX6-NEXT:    ds_read_b64 v[0:1], v0
31; GFX6-NEXT:    ds_read_b32 v2, v2
32; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX6-NEXT:    s_setpc_b64 s[30:31]
34;
35; GFX10-LABEL: load_lds_v3i32:
36; GFX10:       ; %bb.0:
37; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
38; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
39; GFX10-NEXT:    ds_read_b96 v[0:2], v0
40; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX10-NEXT:    s_setpc_b64 s[30:31]
42;
43; GFX11-LABEL: load_lds_v3i32:
44; GFX11:       ; %bb.0:
45; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
47; GFX11-NEXT:    ds_load_b96 v[0:2], v0
48; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
49; GFX11-NEXT:    s_setpc_b64 s[30:31]
50  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr
51  ret <3 x i32> %load
52}
53
; Loads a <3 x i32> through an addrspace(3) pointer with "align 1" on the load,
; and returns the loaded vector.
; Every target is expected to issue twelve single-byte loads (ds_read_u8, or
; ds_load_u8 on GFX11) and reassemble the three dwords from them.
; GFX9/GFX10/GFX11 combine bytes with v_lshl_or_b32; GFX7 and GFX6 use separate
; v_lshlrev_b32 / v_or_b32 pairs. The GFX6 checks compute each byte address
; with v_add_i32 and show no "offset:" operands on the loads.
; The "; GFXn-" lines are FileCheck assertions; the file header says they are
; autogenerated, so regenerate them with the script instead of hand-editing.
54define <3 x i32> @load_lds_v3i32_align1(<3 x i32> addrspace(3)* %ptr) {
55; GFX9-LABEL: load_lds_v3i32_align1:
56; GFX9:       ; %bb.0:
57; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
58; GFX9-NEXT:    ds_read_u8 v1, v0
59; GFX9-NEXT:    ds_read_u8 v2, v0 offset:1
60; GFX9-NEXT:    ds_read_u8 v3, v0 offset:2
61; GFX9-NEXT:    ds_read_u8 v4, v0 offset:3
62; GFX9-NEXT:    ds_read_u8 v5, v0 offset:4
63; GFX9-NEXT:    ds_read_u8 v6, v0 offset:5
64; GFX9-NEXT:    ds_read_u8 v7, v0 offset:6
65; GFX9-NEXT:    ds_read_u8 v8, v0 offset:7
66; GFX9-NEXT:    ds_read_u8 v9, v0 offset:8
67; GFX9-NEXT:    ds_read_u8 v10, v0 offset:9
68; GFX9-NEXT:    ds_read_u8 v11, v0 offset:10
69; GFX9-NEXT:    ds_read_u8 v12, v0 offset:11
70; GFX9-NEXT:    s_waitcnt lgkmcnt(10)
71; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 8, v1
72; GFX9-NEXT:    s_waitcnt lgkmcnt(8)
73; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 8, v3
74; GFX9-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
75; GFX9-NEXT:    s_waitcnt lgkmcnt(6)
76; GFX9-NEXT:    v_lshl_or_b32 v1, v6, 8, v5
77; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
78; GFX9-NEXT:    v_lshl_or_b32 v2, v8, 8, v7
79; GFX9-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
80; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
81; GFX9-NEXT:    v_lshl_or_b32 v2, v10, 8, v9
82; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX9-NEXT:    v_lshl_or_b32 v3, v12, 8, v11
84; GFX9-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
85; GFX9-NEXT:    s_setpc_b64 s[30:31]
86;
87; GFX7-LABEL: load_lds_v3i32_align1:
88; GFX7:       ; %bb.0:
89; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90; GFX7-NEXT:    s_mov_b32 m0, -1
91; GFX7-NEXT:    ds_read_u8 v1, v0 offset:6
92; GFX7-NEXT:    ds_read_u8 v2, v0 offset:4
93; GFX7-NEXT:    ds_read_u8 v3, v0 offset:2
94; GFX7-NEXT:    ds_read_u8 v4, v0 offset:1
95; GFX7-NEXT:    ds_read_u8 v5, v0
96; GFX7-NEXT:    ds_read_u8 v6, v0 offset:3
97; GFX7-NEXT:    ds_read_u8 v7, v0 offset:5
98; GFX7-NEXT:    ds_read_u8 v8, v0 offset:7
99; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
100; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
101; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
102; GFX7-NEXT:    v_or_b32_e32 v4, v4, v5
103; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
104; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 8, v6
105; GFX7-NEXT:    v_or_b32_e32 v3, v5, v3
106; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
107; GFX7-NEXT:    v_or_b32_e32 v3, v3, v4
108; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
109; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v7
110; GFX7-NEXT:    ds_read_u8 v5, v0 offset:11
111; GFX7-NEXT:    ds_read_u8 v6, v0 offset:10
112; GFX7-NEXT:    ds_read_u8 v7, v0 offset:9
113; GFX7-NEXT:    ds_read_u8 v0, v0 offset:8
114; GFX7-NEXT:    v_or_b32_e32 v2, v4, v2
115; GFX7-NEXT:    s_waitcnt lgkmcnt(4)
116; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
117; GFX7-NEXT:    v_or_b32_e32 v1, v4, v1
118; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
119; GFX7-NEXT:    v_or_b32_e32 v1, v1, v2
120; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
121; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v7
122; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX7-NEXT:    v_or_b32_e32 v0, v2, v0
124; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
125; GFX7-NEXT:    v_or_b32_e32 v2, v2, v6
126; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
127; GFX7-NEXT:    v_or_b32_e32 v2, v2, v0
128; GFX7-NEXT:    v_mov_b32_e32 v0, v3
129; GFX7-NEXT:    s_setpc_b64 s[30:31]
130;
131; GFX6-LABEL: load_lds_v3i32_align1:
132; GFX6:       ; %bb.0:
133; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
134; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 5, v0
135; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
136; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 7, v0
137; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 6, v0
138; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 9, v0
139; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 8, v0
140; GFX6-NEXT:    v_add_i32_e32 v7, vcc, 11, v0
141; GFX6-NEXT:    s_mov_b32 m0, -1
142; GFX6-NEXT:    ds_read_u8 v1, v1
143; GFX6-NEXT:    ds_read_u8 v2, v2
144; GFX6-NEXT:    ds_read_u8 v3, v3
145; GFX6-NEXT:    ds_read_u8 v4, v4
146; GFX6-NEXT:    ds_read_u8 v5, v5
147; GFX6-NEXT:    ds_read_u8 v6, v6
148; GFX6-NEXT:    ds_read_u8 v7, v7
149; GFX6-NEXT:    ds_read_u8 v8, v0
150; GFX6-NEXT:    s_waitcnt lgkmcnt(7)
151; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
152; GFX6-NEXT:    s_waitcnt lgkmcnt(6)
153; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
154; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
155; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
156; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
157; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
158; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 10, v0
159; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
160; GFX6-NEXT:    ds_read_u8 v4, v4
161; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
162; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
163; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
164; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
165; GFX6-NEXT:    v_or_b32_e32 v2, v2, v6
166; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 3, v0
167; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 2, v0
168; GFX6-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
169; GFX6-NEXT:    ds_read_u8 v5, v5
170; GFX6-NEXT:    ds_read_u8 v6, v6
171; GFX6-NEXT:    ds_read_u8 v0, v0
172; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
173; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
174; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
175; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
176; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
177; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
178; GFX6-NEXT:    s_waitcnt lgkmcnt(2)
179; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 8, v5
180; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
181; GFX6-NEXT:    v_or_b32_e32 v3, v3, v6
182; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
184; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
185; GFX6-NEXT:    v_or_b32_e32 v0, v0, v8
186; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
187; GFX6-NEXT:    s_setpc_b64 s[30:31]
188;
189; GFX10-LABEL: load_lds_v3i32_align1:
190; GFX10:       ; %bb.0:
191; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
192; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
193; GFX10-NEXT:    ds_read_u8 v1, v0
194; GFX10-NEXT:    ds_read_u8 v2, v0 offset:1
195; GFX10-NEXT:    ds_read_u8 v3, v0 offset:2
196; GFX10-NEXT:    ds_read_u8 v4, v0 offset:3
197; GFX10-NEXT:    ds_read_u8 v5, v0 offset:4
198; GFX10-NEXT:    ds_read_u8 v6, v0 offset:5
199; GFX10-NEXT:    ds_read_u8 v7, v0 offset:6
200; GFX10-NEXT:    ds_read_u8 v8, v0 offset:7
201; GFX10-NEXT:    ds_read_u8 v9, v0 offset:8
202; GFX10-NEXT:    ds_read_u8 v10, v0 offset:9
203; GFX10-NEXT:    ds_read_u8 v11, v0 offset:10
204; GFX10-NEXT:    ds_read_u8 v0, v0 offset:11
205; GFX10-NEXT:    s_waitcnt lgkmcnt(10)
206; GFX10-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
207; GFX10-NEXT:    s_waitcnt lgkmcnt(8)
208; GFX10-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
209; GFX10-NEXT:    s_waitcnt lgkmcnt(6)
210; GFX10-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
211; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
212; GFX10-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
213; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
214; GFX10-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
215; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
216; GFX10-NEXT:    v_lshl_or_b32 v6, v0, 8, v11
217; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
218; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
219; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
220; GFX10-NEXT:    s_setpc_b64 s[30:31]
221;
222; GFX11-LABEL: load_lds_v3i32_align1:
223; GFX11:       ; %bb.0:
224; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
226; GFX11-NEXT:    ds_load_u8 v1, v0
227; GFX11-NEXT:    ds_load_u8 v2, v0 offset:1
228; GFX11-NEXT:    ds_load_u8 v3, v0 offset:2
229; GFX11-NEXT:    ds_load_u8 v4, v0 offset:3
230; GFX11-NEXT:    ds_load_u8 v5, v0 offset:4
231; GFX11-NEXT:    ds_load_u8 v6, v0 offset:5
232; GFX11-NEXT:    ds_load_u8 v7, v0 offset:6
233; GFX11-NEXT:    ds_load_u8 v8, v0 offset:7
234; GFX11-NEXT:    ds_load_u8 v9, v0 offset:8
235; GFX11-NEXT:    ds_load_u8 v10, v0 offset:9
236; GFX11-NEXT:    ds_load_u8 v11, v0 offset:10
237; GFX11-NEXT:    ds_load_u8 v0, v0 offset:11
238; GFX11-NEXT:    s_waitcnt lgkmcnt(10)
239; GFX11-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
240; GFX11-NEXT:    s_waitcnt lgkmcnt(8)
241; GFX11-NEXT:    v_lshl_or_b32 v2, v4, 8, v3
242; GFX11-NEXT:    s_waitcnt lgkmcnt(6)
243; GFX11-NEXT:    v_lshl_or_b32 v3, v6, 8, v5
244; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
245; GFX11-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
246; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
247; GFX11-NEXT:    v_lshl_or_b32 v5, v10, 8, v9
248; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
249; GFX11-NEXT:    v_lshl_or_b32 v6, v0, 8, v11
250; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
251; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
252; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
253; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
254; GFX11-NEXT:    s_setpc_b64 s[30:31]
255  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 1
256  ret <3 x i32> %load
257}
258
; Loads a <3 x i32> through an addrspace(3) pointer with "align 2" on the load,
; and returns the loaded vector.
; Every target is expected to issue six 16-bit loads (ds_read_u16, or
; ds_load_u16 on GFX11) and merge each pair into one dword.
; GFX9/GFX10/GFX11 merge with v_lshl_or_b32; GFX7 and GFX6 use v_lshlrev_b32
; followed by v_or_b32. The GFX6 checks form the addresses with v_add_i32.
; The "; GFXn-" lines are FileCheck assertions; the file header says they are
; autogenerated, so regenerate them with the script instead of hand-editing.
259define <3 x i32> @load_lds_v3i32_align2(<3 x i32> addrspace(3)* %ptr) {
260; GFX9-LABEL: load_lds_v3i32_align2:
261; GFX9:       ; %bb.0:
262; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263; GFX9-NEXT:    ds_read_u16 v1, v0
264; GFX9-NEXT:    ds_read_u16 v2, v0 offset:2
265; GFX9-NEXT:    ds_read_u16 v3, v0 offset:4
266; GFX9-NEXT:    ds_read_u16 v4, v0 offset:6
267; GFX9-NEXT:    ds_read_u16 v5, v0 offset:8
268; GFX9-NEXT:    ds_read_u16 v6, v0 offset:10
269; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
270; GFX9-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
271; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
272; GFX9-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
273; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
274; GFX9-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
275; GFX9-NEXT:    s_setpc_b64 s[30:31]
276;
277; GFX7-LABEL: load_lds_v3i32_align2:
278; GFX7:       ; %bb.0:
279; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280; GFX7-NEXT:    s_mov_b32 m0, -1
281; GFX7-NEXT:    ds_read_u16 v2, v0 offset:8
282; GFX7-NEXT:    ds_read_u16 v1, v0 offset:4
283; GFX7-NEXT:    ds_read_u16 v3, v0 offset:2
284; GFX7-NEXT:    ds_read_u16 v4, v0
285; GFX7-NEXT:    ds_read_u16 v5, v0 offset:6
286; GFX7-NEXT:    ds_read_u16 v6, v0 offset:10
287; GFX7-NEXT:    s_waitcnt lgkmcnt(3)
288; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
289; GFX7-NEXT:    s_waitcnt lgkmcnt(2)
290; GFX7-NEXT:    v_or_b32_e32 v0, v0, v4
291; GFX7-NEXT:    s_waitcnt lgkmcnt(1)
292; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
293; GFX7-NEXT:    v_or_b32_e32 v1, v3, v1
294; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
296; GFX7-NEXT:    v_or_b32_e32 v2, v3, v2
297; GFX7-NEXT:    s_setpc_b64 s[30:31]
298;
299; GFX6-LABEL: load_lds_v3i32_align2:
300; GFX6:       ; %bb.0:
301; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
302; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 6, v0
303; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 4, v0
304; GFX6-NEXT:    v_add_i32_e32 v3, vcc, 10, v0
305; GFX6-NEXT:    v_add_i32_e32 v4, vcc, 8, v0
306; GFX6-NEXT:    v_add_i32_e32 v5, vcc, 2, v0
307; GFX6-NEXT:    s_mov_b32 m0, -1
308; GFX6-NEXT:    ds_read_u16 v1, v1
309; GFX6-NEXT:    ds_read_u16 v2, v2
310; GFX6-NEXT:    ds_read_u16 v3, v3
311; GFX6-NEXT:    ds_read_u16 v4, v4
312; GFX6-NEXT:    ds_read_u16 v5, v5
313; GFX6-NEXT:    ds_read_u16 v0, v0
314; GFX6-NEXT:    s_waitcnt lgkmcnt(5)
315; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
316; GFX6-NEXT:    s_waitcnt lgkmcnt(4)
317; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
318; GFX6-NEXT:    s_waitcnt lgkmcnt(3)
319; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
320; GFX6-NEXT:    s_waitcnt lgkmcnt(1)
321; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
322; GFX6-NEXT:    v_or_b32_e32 v2, v2, v4
323; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
324; GFX6-NEXT:    v_or_b32_e32 v0, v3, v0
325; GFX6-NEXT:    s_setpc_b64 s[30:31]
326;
327; GFX10-LABEL: load_lds_v3i32_align2:
328; GFX10:       ; %bb.0:
329; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
330; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
331; GFX10-NEXT:    ds_read_u16 v1, v0
332; GFX10-NEXT:    ds_read_u16 v2, v0 offset:2
333; GFX10-NEXT:    ds_read_u16 v3, v0 offset:4
334; GFX10-NEXT:    ds_read_u16 v4, v0 offset:6
335; GFX10-NEXT:    ds_read_u16 v5, v0 offset:8
336; GFX10-NEXT:    ds_read_u16 v6, v0 offset:10
337; GFX10-NEXT:    s_waitcnt lgkmcnt(4)
338; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
339; GFX10-NEXT:    s_waitcnt lgkmcnt(2)
340; GFX10-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
341; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
342; GFX10-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
343; GFX10-NEXT:    s_setpc_b64 s[30:31]
344;
345; GFX11-LABEL: load_lds_v3i32_align2:
346; GFX11:       ; %bb.0:
347; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
348; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
349; GFX11-NEXT:    ds_load_u16 v1, v0
350; GFX11-NEXT:    ds_load_u16 v2, v0 offset:2
351; GFX11-NEXT:    ds_load_u16 v3, v0 offset:4
352; GFX11-NEXT:    ds_load_u16 v4, v0 offset:6
353; GFX11-NEXT:    ds_load_u16 v5, v0 offset:8
354; GFX11-NEXT:    ds_load_u16 v6, v0 offset:10
355; GFX11-NEXT:    s_waitcnt lgkmcnt(4)
356; GFX11-NEXT:    v_lshl_or_b32 v0, v2, 16, v1
357; GFX11-NEXT:    s_waitcnt lgkmcnt(2)
358; GFX11-NEXT:    v_lshl_or_b32 v1, v4, 16, v3
359; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
360; GFX11-NEXT:    v_lshl_or_b32 v2, v6, 16, v5
361; GFX11-NEXT:    s_setpc_b64 s[30:31]
362  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 2
363  ret <3 x i32> %load
364}
365
; Loads a <3 x i32> through an addrspace(3) pointer with "align 4" on the load,
; and returns the loaded vector.
; GFX9/GFX7/GFX10 are expected to use ds_read2_b32 (offset1:1) for the first
; two dwords plus ds_read_b32 at offset:8 for the third; GFX11 expects the
; ds_load_2addr_b32 / ds_load_b32 spellings of the same pair.
; GFX6 is expected to use three separate ds_read_b32 loads, with the +4 and +8
; addresses formed by v_add_i32.
; The "; GFXn-" lines are FileCheck assertions; the file header says they are
; autogenerated, so regenerate them with the script instead of hand-editing.
366define <3 x i32> @load_lds_v3i32_align4(<3 x i32> addrspace(3)* %ptr) {
367; GFX9-LABEL: load_lds_v3i32_align4:
368; GFX9:       ; %bb.0:
369; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
370; GFX9-NEXT:    v_mov_b32_e32 v2, v0
371; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
372; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
373; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
374; GFX9-NEXT:    s_setpc_b64 s[30:31]
375;
376; GFX7-LABEL: load_lds_v3i32_align4:
377; GFX7:       ; %bb.0:
378; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
379; GFX7-NEXT:    v_mov_b32_e32 v2, v0
380; GFX7-NEXT:    s_mov_b32 m0, -1
381; GFX7-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
382; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
383; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
384; GFX7-NEXT:    s_setpc_b64 s[30:31]
385;
386; GFX6-LABEL: load_lds_v3i32_align4:
387; GFX6:       ; %bb.0:
388; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
389; GFX6-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
390; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v0
391; GFX6-NEXT:    s_mov_b32 m0, -1
392; GFX6-NEXT:    ds_read_b32 v2, v2
393; GFX6-NEXT:    ds_read_b32 v0, v0
394; GFX6-NEXT:    ds_read_b32 v1, v1
395; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
396; GFX6-NEXT:    s_setpc_b64 s[30:31]
397;
398; GFX10-LABEL: load_lds_v3i32_align4:
399; GFX10:       ; %bb.0:
400; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
401; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
402; GFX10-NEXT:    v_mov_b32_e32 v2, v0
403; GFX10-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
404; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
405; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
406; GFX10-NEXT:    s_setpc_b64 s[30:31]
407;
408; GFX11-LABEL: load_lds_v3i32_align4:
409; GFX11:       ; %bb.0:
410; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
411; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
412; GFX11-NEXT:    v_mov_b32_e32 v2, v0
413; GFX11-NEXT:    ds_load_2addr_b32 v[0:1], v0 offset1:1
414; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
415; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
416; GFX11-NEXT:    s_setpc_b64 s[30:31]
417  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
418  ret <3 x i32> %load
419}
420
; Loads a <3 x i32> through an addrspace(3) pointer with "align 8" on the load,
; and returns the loaded vector.
; Every target is expected to split the access into one 64-bit load for the
; first two dwords and one 32-bit load for the third: ds_read_b64 + ds_read_b32
; on GFX9/GFX7/GFX6/GFX10, ds_load_b64 + ds_load_b32 on GFX11.
; GFX6 forms the +8 address with v_add_i32; the other targets use "offset:8".
; The "; GFXn-" lines are FileCheck assertions; the file header says they are
; autogenerated, so regenerate them with the script instead of hand-editing.
421define <3 x i32> @load_lds_v3i32_align8(<3 x i32> addrspace(3)* %ptr) {
422; GFX9-LABEL: load_lds_v3i32_align8:
423; GFX9:       ; %bb.0:
424; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
425; GFX9-NEXT:    v_mov_b32_e32 v2, v0
426; GFX9-NEXT:    ds_read_b64 v[0:1], v0
427; GFX9-NEXT:    ds_read_b32 v2, v2 offset:8
428; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
429; GFX9-NEXT:    s_setpc_b64 s[30:31]
430;
431; GFX7-LABEL: load_lds_v3i32_align8:
432; GFX7:       ; %bb.0:
433; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
434; GFX7-NEXT:    v_mov_b32_e32 v2, v0
435; GFX7-NEXT:    s_mov_b32 m0, -1
436; GFX7-NEXT:    ds_read_b64 v[0:1], v0
437; GFX7-NEXT:    ds_read_b32 v2, v2 offset:8
438; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
439; GFX7-NEXT:    s_setpc_b64 s[30:31]
440;
441; GFX6-LABEL: load_lds_v3i32_align8:
442; GFX6:       ; %bb.0:
443; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
444; GFX6-NEXT:    v_mov_b32_e32 v2, v0
445; GFX6-NEXT:    s_mov_b32 m0, -1
446; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
447; GFX6-NEXT:    ds_read_b64 v[0:1], v0
448; GFX6-NEXT:    ds_read_b32 v2, v2
449; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
450; GFX6-NEXT:    s_setpc_b64 s[30:31]
451;
452; GFX10-LABEL: load_lds_v3i32_align8:
453; GFX10:       ; %bb.0:
454; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
455; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
456; GFX10-NEXT:    v_mov_b32_e32 v2, v0
457; GFX10-NEXT:    ds_read_b64 v[0:1], v0
458; GFX10-NEXT:    ds_read_b32 v2, v2 offset:8
459; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX10-NEXT:    s_setpc_b64 s[30:31]
461;
462; GFX11-LABEL: load_lds_v3i32_align8:
463; GFX11:       ; %bb.0:
464; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
465; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
466; GFX11-NEXT:    v_mov_b32_e32 v2, v0
467; GFX11-NEXT:    ds_load_b64 v[0:1], v0
468; GFX11-NEXT:    ds_load_b32 v2, v2 offset:8
469; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX11-NEXT:    s_setpc_b64 s[30:31]
471  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 8
472  ret <3 x i32> %load
473}
474
; Loads a <3 x i32> through an addrspace(3) pointer with "align 16" on the
; load, and returns the loaded vector.
; GFX9/GFX7/GFX10 are expected to use a single ds_read_b96, and GFX11 a single
; ds_load_b96. GFX6 is still expected to split the access into ds_read_b64 +
; ds_read_b32, with the +8 address formed by v_add_i32.
; The "; GFXn-" lines are FileCheck assertions; the file header says they are
; autogenerated, so regenerate them with the script instead of hand-editing.
475define <3 x i32> @load_lds_v3i32_align16(<3 x i32> addrspace(3)* %ptr) {
476; GFX9-LABEL: load_lds_v3i32_align16:
477; GFX9:       ; %bb.0:
478; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
479; GFX9-NEXT:    ds_read_b96 v[0:2], v0
480; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX9-NEXT:    s_setpc_b64 s[30:31]
482;
483; GFX7-LABEL: load_lds_v3i32_align16:
484; GFX7:       ; %bb.0:
485; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
486; GFX7-NEXT:    s_mov_b32 m0, -1
487; GFX7-NEXT:    ds_read_b96 v[0:2], v0
488; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
489; GFX7-NEXT:    s_setpc_b64 s[30:31]
490;
491; GFX6-LABEL: load_lds_v3i32_align16:
492; GFX6:       ; %bb.0:
493; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
494; GFX6-NEXT:    v_mov_b32_e32 v2, v0
495; GFX6-NEXT:    s_mov_b32 m0, -1
496; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
497; GFX6-NEXT:    ds_read_b64 v[0:1], v0
498; GFX6-NEXT:    ds_read_b32 v2, v2
499; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
500; GFX6-NEXT:    s_setpc_b64 s[30:31]
501;
502; GFX10-LABEL: load_lds_v3i32_align16:
503; GFX10:       ; %bb.0:
504; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
505; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
506; GFX10-NEXT:    ds_read_b96 v[0:2], v0
507; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX10-NEXT:    s_setpc_b64 s[30:31]
509;
510; GFX11-LABEL: load_lds_v3i32_align16:
511; GFX11:       ; %bb.0:
512; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
513; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
514; GFX11-NEXT:    ds_load_b96 v[0:2], v0
515; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
516; GFX11-NEXT:    s_setpc_b64 s[30:31]
517  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
518  ret <3 x i32> %load
519}
520