1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -verify-machineinstrs < %s | FileCheck --check-prefix=GFX7 %s
4; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefix=GFX6 %s
5; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
7
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with no explicit alignment on the store instruction.
; Expected code for gfx900, hawaii, gfx1010 and gfx1100 is a single 96-bit DS
; store (ds_write_b96 / ds_store_b96); the tahiti run instead expects a
; ds_write_b32 at offset 8 followed by a ds_write_b64.
8define amdgpu_kernel void @store_lds_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
9; GFX9-LABEL: store_lds_v3i32:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
12; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    v_mov_b32_e32 v0, s4
15; GFX9-NEXT:    v_mov_b32_e32 v1, s5
16; GFX9-NEXT:    v_mov_b32_e32 v2, s6
17; GFX9-NEXT:    v_mov_b32_e32 v3, s2
18; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
19; GFX9-NEXT:    s_endpgm
20;
21; GFX7-LABEL: store_lds_v3i32:
22; GFX7:       ; %bb.0:
23; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4
24; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
25; GFX7-NEXT:    s_mov_b32 m0, -1
26; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
27; GFX7-NEXT:    v_mov_b32_e32 v0, s4
28; GFX7-NEXT:    v_mov_b32_e32 v1, s5
29; GFX7-NEXT:    v_mov_b32_e32 v2, s6
30; GFX7-NEXT:    v_mov_b32_e32 v3, s0
31; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
32; GFX7-NEXT:    s_endpgm
33;
34; GFX6-LABEL: store_lds_v3i32:
35; GFX6:       ; %bb.0:
36; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x0
37; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
38; GFX6-NEXT:    s_mov_b32 m0, -1
39; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX6-NEXT:    v_mov_b32_e32 v2, s4
41; GFX6-NEXT:    v_mov_b32_e32 v1, s2
42; GFX6-NEXT:    v_mov_b32_e32 v0, s0
43; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
44; GFX6-NEXT:    v_mov_b32_e32 v1, s1
45; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
46; GFX6-NEXT:    s_endpgm
47;
48; GFX10-LABEL: store_lds_v3i32:
49; GFX10:       ; %bb.0:
50; GFX10-NEXT:    s_clause 0x1
51; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
52; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
53; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
54; GFX10-NEXT:    v_mov_b32_e32 v0, s4
55; GFX10-NEXT:    v_mov_b32_e32 v1, s5
56; GFX10-NEXT:    v_mov_b32_e32 v2, s6
57; GFX10-NEXT:    v_mov_b32_e32 v3, s2
58; GFX10-NEXT:    ds_write_b96 v3, v[0:2]
59; GFX10-NEXT:    s_endpgm
60;
61; GFX11-LABEL: store_lds_v3i32:
62; GFX11:       ; %bb.0:
63; GFX11-NEXT:    s_clause 0x1
64; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x10
65; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
66; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
67; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
68; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
69; GFX11-NEXT:    ds_store_b96 v3, v[0:2]
70; GFX11-NEXT:    s_endpgm
71  store <3 x i32> %x, <3 x i32> addrspace(3)* %out
72  ret void
73}
74
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked "align 1".
; Every run line expects the value to be written as twelve single-byte DS
; stores covering offsets 0 through 11. The gfx900, gfx1010 and gfx1100 checks
; use the *_d16_hi byte store for offsets 2, 6 and 10; the hawaii and tahiti
; checks obtain those bytes with s_lshr_b32 by 16 instead.
75define amdgpu_kernel void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
76; GFX9-LABEL: store_lds_v3i32_align1:
77; GFX9:       ; %bb.0:
78; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
79; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
80; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX9-NEXT:    v_mov_b32_e32 v0, s2
82; GFX9-NEXT:    v_mov_b32_e32 v1, s6
83; GFX9-NEXT:    v_mov_b32_e32 v2, s4
84; GFX9-NEXT:    ds_write_b8 v0, v1 offset:8
85; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
86; GFX9-NEXT:    ds_write_b8 v0, v2
87; GFX9-NEXT:    ds_write_b8_d16_hi v0, v2 offset:2
88; GFX9-NEXT:    v_mov_b32_e32 v1, s5
89; GFX9-NEXT:    s_lshr_b32 s0, s6, 8
90; GFX9-NEXT:    ds_write_b8 v0, v1 offset:4
91; GFX9-NEXT:    ds_write_b8_d16_hi v0, v1 offset:6
92; GFX9-NEXT:    v_mov_b32_e32 v1, s0
93; GFX9-NEXT:    s_lshr_b32 s0, s6, 24
94; GFX9-NEXT:    ds_write_b8 v0, v1 offset:9
95; GFX9-NEXT:    v_mov_b32_e32 v1, s0
96; GFX9-NEXT:    s_lshr_b32 s0, s4, 8
97; GFX9-NEXT:    ds_write_b8 v0, v1 offset:11
98; GFX9-NEXT:    v_mov_b32_e32 v1, s0
99; GFX9-NEXT:    s_lshr_b32 s0, s4, 24
100; GFX9-NEXT:    ds_write_b8 v0, v1 offset:1
101; GFX9-NEXT:    v_mov_b32_e32 v1, s0
102; GFX9-NEXT:    s_lshr_b32 s0, s5, 8
103; GFX9-NEXT:    ds_write_b8 v0, v1 offset:3
104; GFX9-NEXT:    v_mov_b32_e32 v1, s0
105; GFX9-NEXT:    s_lshr_b32 s0, s5, 24
106; GFX9-NEXT:    ds_write_b8 v0, v1 offset:5
107; GFX9-NEXT:    v_mov_b32_e32 v1, s0
108; GFX9-NEXT:    ds_write_b8 v0, v1 offset:7
109; GFX9-NEXT:    s_endpgm
110;
111; GFX7-LABEL: store_lds_v3i32_align1:
112; GFX7:       ; %bb.0:
113; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
114; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
115; GFX7-NEXT:    s_mov_b32 m0, -1
116; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX7-NEXT:    v_mov_b32_e32 v0, s4
118; GFX7-NEXT:    v_mov_b32_e32 v1, s2
119; GFX7-NEXT:    v_mov_b32_e32 v2, s0
120; GFX7-NEXT:    ds_write_b8 v0, v1 offset:8
121; GFX7-NEXT:    ds_write_b8 v0, v2
122; GFX7-NEXT:    v_mov_b32_e32 v1, s1
123; GFX7-NEXT:    s_lshr_b32 s3, s2, 8
124; GFX7-NEXT:    ds_write_b8 v0, v1 offset:4
125; GFX7-NEXT:    v_mov_b32_e32 v1, s3
126; GFX7-NEXT:    s_lshr_b32 s3, s2, 24
127; GFX7-NEXT:    ds_write_b8 v0, v1 offset:9
128; GFX7-NEXT:    v_mov_b32_e32 v1, s3
129; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
130; GFX7-NEXT:    ds_write_b8 v0, v1 offset:11
131; GFX7-NEXT:    v_mov_b32_e32 v1, s2
132; GFX7-NEXT:    s_lshr_b32 s2, s0, 8
133; GFX7-NEXT:    ds_write_b8 v0, v1 offset:10
134; GFX7-NEXT:    v_mov_b32_e32 v1, s2
135; GFX7-NEXT:    s_lshr_b32 s2, s0, 24
136; GFX7-NEXT:    ds_write_b8 v0, v1 offset:1
137; GFX7-NEXT:    v_mov_b32_e32 v1, s2
138; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
139; GFX7-NEXT:    ds_write_b8 v0, v1 offset:3
140; GFX7-NEXT:    v_mov_b32_e32 v1, s0
141; GFX7-NEXT:    s_lshr_b32 s0, s1, 8
142; GFX7-NEXT:    ds_write_b8 v0, v1 offset:2
143; GFX7-NEXT:    v_mov_b32_e32 v1, s0
144; GFX7-NEXT:    s_lshr_b32 s0, s1, 24
145; GFX7-NEXT:    ds_write_b8 v0, v1 offset:5
146; GFX7-NEXT:    v_mov_b32_e32 v1, s0
147; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
148; GFX7-NEXT:    ds_write_b8 v0, v1 offset:7
149; GFX7-NEXT:    v_mov_b32_e32 v1, s0
150; GFX7-NEXT:    ds_write_b8 v0, v1 offset:6
151; GFX7-NEXT:    s_endpgm
152;
153; GFX6-LABEL: store_lds_v3i32_align1:
154; GFX6:       ; %bb.0:
155; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x0
156; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
157; GFX6-NEXT:    s_mov_b32 m0, -1
158; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
159; GFX6-NEXT:    v_mov_b32_e32 v0, s4
160; GFX6-NEXT:    v_mov_b32_e32 v1, s2
161; GFX6-NEXT:    v_mov_b32_e32 v2, s0
162; GFX6-NEXT:    ds_write_b8 v0, v1 offset:8
163; GFX6-NEXT:    ds_write_b8 v0, v2
164; GFX6-NEXT:    v_mov_b32_e32 v1, s1
165; GFX6-NEXT:    s_lshr_b32 s3, s2, 8
166; GFX6-NEXT:    ds_write_b8 v0, v1 offset:4
167; GFX6-NEXT:    v_mov_b32_e32 v1, s3
168; GFX6-NEXT:    s_lshr_b32 s3, s2, 24
169; GFX6-NEXT:    ds_write_b8 v0, v1 offset:9
170; GFX6-NEXT:    v_mov_b32_e32 v1, s3
171; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
172; GFX6-NEXT:    ds_write_b8 v0, v1 offset:11
173; GFX6-NEXT:    v_mov_b32_e32 v1, s2
174; GFX6-NEXT:    s_lshr_b32 s2, s0, 8
175; GFX6-NEXT:    ds_write_b8 v0, v1 offset:10
176; GFX6-NEXT:    v_mov_b32_e32 v1, s2
177; GFX6-NEXT:    s_lshr_b32 s2, s0, 24
178; GFX6-NEXT:    ds_write_b8 v0, v1 offset:1
179; GFX6-NEXT:    v_mov_b32_e32 v1, s2
180; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
181; GFX6-NEXT:    ds_write_b8 v0, v1 offset:3
182; GFX6-NEXT:    v_mov_b32_e32 v1, s0
183; GFX6-NEXT:    s_lshr_b32 s0, s1, 8
184; GFX6-NEXT:    ds_write_b8 v0, v1 offset:2
185; GFX6-NEXT:    v_mov_b32_e32 v1, s0
186; GFX6-NEXT:    s_lshr_b32 s0, s1, 24
187; GFX6-NEXT:    ds_write_b8 v0, v1 offset:5
188; GFX6-NEXT:    v_mov_b32_e32 v1, s0
189; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
190; GFX6-NEXT:    ds_write_b8 v0, v1 offset:7
191; GFX6-NEXT:    v_mov_b32_e32 v1, s0
192; GFX6-NEXT:    ds_write_b8 v0, v1 offset:6
193; GFX6-NEXT:    s_endpgm
194;
195; GFX10-LABEL: store_lds_v3i32_align1:
196; GFX10:       ; %bb.0:
197; GFX10-NEXT:    s_clause 0x1
198; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
199; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
200; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
201; GFX10-NEXT:    v_mov_b32_e32 v0, s2
202; GFX10-NEXT:    v_mov_b32_e32 v1, s6
203; GFX10-NEXT:    v_mov_b32_e32 v2, s4
204; GFX10-NEXT:    v_mov_b32_e32 v3, s5
205; GFX10-NEXT:    s_lshr_b32 s0, s6, 8
206; GFX10-NEXT:    s_lshr_b32 s1, s6, 24
207; GFX10-NEXT:    s_lshr_b32 s2, s4, 8
208; GFX10-NEXT:    s_lshr_b32 s3, s4, 24
209; GFX10-NEXT:    s_lshr_b32 s4, s5, 8
210; GFX10-NEXT:    s_lshr_b32 s5, s5, 24
211; GFX10-NEXT:    v_mov_b32_e32 v4, s0
212; GFX10-NEXT:    v_mov_b32_e32 v5, s1
213; GFX10-NEXT:    v_mov_b32_e32 v6, s2
214; GFX10-NEXT:    v_mov_b32_e32 v7, s3
215; GFX10-NEXT:    v_mov_b32_e32 v8, s4
216; GFX10-NEXT:    v_mov_b32_e32 v9, s5
217; GFX10-NEXT:    ds_write_b8 v0, v1 offset:8
218; GFX10-NEXT:    ds_write_b8_d16_hi v0, v1 offset:10
219; GFX10-NEXT:    ds_write_b8 v0, v2
220; GFX10-NEXT:    ds_write_b8_d16_hi v0, v2 offset:2
221; GFX10-NEXT:    ds_write_b8 v0, v3 offset:4
222; GFX10-NEXT:    ds_write_b8_d16_hi v0, v3 offset:6
223; GFX10-NEXT:    ds_write_b8 v0, v4 offset:9
224; GFX10-NEXT:    ds_write_b8 v0, v5 offset:11
225; GFX10-NEXT:    ds_write_b8 v0, v6 offset:1
226; GFX10-NEXT:    ds_write_b8 v0, v7 offset:3
227; GFX10-NEXT:    ds_write_b8 v0, v8 offset:5
228; GFX10-NEXT:    ds_write_b8 v0, v9 offset:7
229; GFX10-NEXT:    s_endpgm
230;
231; GFX11-LABEL: store_lds_v3i32_align1:
232; GFX11:       ; %bb.0:
233; GFX11-NEXT:    s_clause 0x1
234; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
235; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x10
236; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
237; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2
238; GFX11-NEXT:    s_lshr_b32 s3, s2, 8
239; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
240; GFX11-NEXT:    s_lshr_b32 s2, s2, 24
241; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
242; GFX11-NEXT:    v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v5, s2
243; GFX11-NEXT:    s_lshr_b32 s4, s0, 8
244; GFX11-NEXT:    s_lshr_b32 s0, s0, 24
245; GFX11-NEXT:    s_lshr_b32 s5, s1, 8
246; GFX11-NEXT:    s_lshr_b32 s1, s1, 24
247; GFX11-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s0
248; GFX11-NEXT:    v_dual_mov_b32 v8, s5 :: v_dual_mov_b32 v9, s1
249; GFX11-NEXT:    ds_store_b8 v0, v1 offset:8
250; GFX11-NEXT:    ds_store_b8 v0, v2
251; GFX11-NEXT:    ds_store_b8 v0, v4 offset:9
252; GFX11-NEXT:    ds_store_b8_d16_hi v0, v1 offset:10
253; GFX11-NEXT:    ds_store_b8 v0, v5 offset:11
254; GFX11-NEXT:    ds_store_b8_d16_hi v0, v2 offset:2
255; GFX11-NEXT:    ds_store_b8 v0, v6 offset:1
256; GFX11-NEXT:    ds_store_b8 v0, v3 offset:4
257; GFX11-NEXT:    ds_store_b8 v0, v7 offset:3
258; GFX11-NEXT:    ds_store_b8 v0, v8 offset:5
259; GFX11-NEXT:    ds_store_b8_d16_hi v0, v3 offset:6
260; GFX11-NEXT:    ds_store_b8 v0, v9 offset:7
261; GFX11-NEXT:    s_endpgm
262  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
263  ret void
264}
265
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked "align 2".
; Every run line expects six 16-bit DS stores at offsets 0, 2, 4, 6, 8 and 10.
; The gfx900, gfx1010 and gfx1100 checks write the upper half of each dword with
; the *_d16_hi store; the hawaii and tahiti checks shift it down with
; s_lshr_b32 by 16 first.
266define amdgpu_kernel void @store_lds_v3i32_align2(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
267; GFX9-LABEL: store_lds_v3i32_align2:
268; GFX9:       ; %bb.0:
269; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
270; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
271; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX9-NEXT:    v_mov_b32_e32 v0, s2
273; GFX9-NEXT:    v_mov_b32_e32 v1, s6
274; GFX9-NEXT:    v_mov_b32_e32 v2, s4
275; GFX9-NEXT:    ds_write_b16 v0, v1 offset:8
276; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
277; GFX9-NEXT:    ds_write_b16 v0, v2
278; GFX9-NEXT:    ds_write_b16_d16_hi v0, v2 offset:2
279; GFX9-NEXT:    v_mov_b32_e32 v1, s5
280; GFX9-NEXT:    ds_write_b16 v0, v1 offset:4
281; GFX9-NEXT:    ds_write_b16_d16_hi v0, v1 offset:6
282; GFX9-NEXT:    s_endpgm
283;
284; GFX7-LABEL: store_lds_v3i32_align2:
285; GFX7:       ; %bb.0:
286; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
287; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
288; GFX7-NEXT:    s_mov_b32 m0, -1
289; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
290; GFX7-NEXT:    v_mov_b32_e32 v0, s4
291; GFX7-NEXT:    v_mov_b32_e32 v1, s2
292; GFX7-NEXT:    v_mov_b32_e32 v2, s0
293; GFX7-NEXT:    ds_write_b16 v0, v1 offset:8
294; GFX7-NEXT:    ds_write_b16 v0, v2
295; GFX7-NEXT:    v_mov_b32_e32 v1, s1
296; GFX7-NEXT:    s_lshr_b32 s2, s2, 16
297; GFX7-NEXT:    ds_write_b16 v0, v1 offset:4
298; GFX7-NEXT:    v_mov_b32_e32 v1, s2
299; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
300; GFX7-NEXT:    ds_write_b16 v0, v1 offset:10
301; GFX7-NEXT:    v_mov_b32_e32 v1, s0
302; GFX7-NEXT:    s_lshr_b32 s0, s1, 16
303; GFX7-NEXT:    ds_write_b16 v0, v1 offset:2
304; GFX7-NEXT:    v_mov_b32_e32 v1, s0
305; GFX7-NEXT:    ds_write_b16 v0, v1 offset:6
306; GFX7-NEXT:    s_endpgm
307;
308; GFX6-LABEL: store_lds_v3i32_align2:
309; GFX6:       ; %bb.0:
310; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x0
311; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
312; GFX6-NEXT:    s_mov_b32 m0, -1
313; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX6-NEXT:    v_mov_b32_e32 v0, s4
315; GFX6-NEXT:    v_mov_b32_e32 v1, s2
316; GFX6-NEXT:    v_mov_b32_e32 v2, s0
317; GFX6-NEXT:    ds_write_b16 v0, v1 offset:8
318; GFX6-NEXT:    ds_write_b16 v0, v2
319; GFX6-NEXT:    v_mov_b32_e32 v1, s1
320; GFX6-NEXT:    s_lshr_b32 s2, s2, 16
321; GFX6-NEXT:    ds_write_b16 v0, v1 offset:4
322; GFX6-NEXT:    v_mov_b32_e32 v1, s2
323; GFX6-NEXT:    s_lshr_b32 s0, s0, 16
324; GFX6-NEXT:    ds_write_b16 v0, v1 offset:10
325; GFX6-NEXT:    v_mov_b32_e32 v1, s0
326; GFX6-NEXT:    s_lshr_b32 s0, s1, 16
327; GFX6-NEXT:    ds_write_b16 v0, v1 offset:2
328; GFX6-NEXT:    v_mov_b32_e32 v1, s0
329; GFX6-NEXT:    ds_write_b16 v0, v1 offset:6
330; GFX6-NEXT:    s_endpgm
331;
332; GFX10-LABEL: store_lds_v3i32_align2:
333; GFX10:       ; %bb.0:
334; GFX10-NEXT:    s_clause 0x1
335; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
336; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
337; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX10-NEXT:    v_mov_b32_e32 v0, s2
339; GFX10-NEXT:    v_mov_b32_e32 v1, s6
340; GFX10-NEXT:    v_mov_b32_e32 v2, s4
341; GFX10-NEXT:    v_mov_b32_e32 v3, s5
342; GFX10-NEXT:    ds_write_b16 v0, v1 offset:8
343; GFX10-NEXT:    ds_write_b16_d16_hi v0, v1 offset:10
344; GFX10-NEXT:    ds_write_b16 v0, v2
345; GFX10-NEXT:    ds_write_b16_d16_hi v0, v2 offset:2
346; GFX10-NEXT:    ds_write_b16 v0, v3 offset:4
347; GFX10-NEXT:    ds_write_b16_d16_hi v0, v3 offset:6
348; GFX10-NEXT:    s_endpgm
349;
350; GFX11-LABEL: store_lds_v3i32_align2:
351; GFX11:       ; %bb.0:
352; GFX11-NEXT:    s_clause 0x1
353; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
354; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x10
355; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
356; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s2
357; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
358; GFX11-NEXT:    ds_store_b16_d16_hi v0, v1 offset:10
359; GFX11-NEXT:    ds_store_b16 v0, v2
360; GFX11-NEXT:    ds_store_b16_d16_hi v0, v2 offset:2
361; GFX11-NEXT:    ds_store_b16 v0, v3 offset:4
362; GFX11-NEXT:    ds_store_b16 v0, v1 offset:8
363; GFX11-NEXT:    ds_store_b16_d16_hi v0, v3 offset:6
364; GFX11-NEXT:    s_endpgm
365  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 2
366  ret void
367}
368
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked "align 4".
; Every run line expects the first two dwords to go out as one two-address
; 32-bit DS store (ds_write2_b32, spelled ds_store_2addr_b32 for gfx1100) and
; the third dword as a separate 32-bit store at offset 8. The relative order of
; those two stores differs between the run lines.
369define amdgpu_kernel void @store_lds_v3i32_align4(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
370; GFX9-LABEL: store_lds_v3i32_align4:
371; GFX9:       ; %bb.0:
372; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
373; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
374; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX9-NEXT:    v_mov_b32_e32 v0, s2
376; GFX9-NEXT:    v_mov_b32_e32 v1, s4
377; GFX9-NEXT:    v_mov_b32_e32 v2, s5
378; GFX9-NEXT:    v_mov_b32_e32 v3, s6
379; GFX9-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
380; GFX9-NEXT:    ds_write_b32 v0, v3 offset:8
381; GFX9-NEXT:    s_endpgm
382;
383; GFX7-LABEL: store_lds_v3i32_align4:
384; GFX7:       ; %bb.0:
385; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
386; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
387; GFX7-NEXT:    s_mov_b32 m0, -1
388; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
389; GFX7-NEXT:    v_mov_b32_e32 v0, s4
390; GFX7-NEXT:    v_mov_b32_e32 v1, s0
391; GFX7-NEXT:    v_mov_b32_e32 v2, s1
392; GFX7-NEXT:    ds_write2_b32 v0, v1, v2 offset1:1
393; GFX7-NEXT:    v_mov_b32_e32 v1, s2
394; GFX7-NEXT:    ds_write_b32 v0, v1 offset:8
395; GFX7-NEXT:    s_endpgm
396;
397; GFX6-LABEL: store_lds_v3i32_align4:
398; GFX6:       ; %bb.0:
399; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x0
400; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
401; GFX6-NEXT:    s_mov_b32 m0, -1
402; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
403; GFX6-NEXT:    v_mov_b32_e32 v0, s4
404; GFX6-NEXT:    v_mov_b32_e32 v1, s2
405; GFX6-NEXT:    v_mov_b32_e32 v2, s0
406; GFX6-NEXT:    ds_write_b32 v0, v1 offset:8
407; GFX6-NEXT:    v_mov_b32_e32 v1, s1
408; GFX6-NEXT:    ds_write2_b32 v0, v2, v1 offset1:1
409; GFX6-NEXT:    s_endpgm
410;
411; GFX10-LABEL: store_lds_v3i32_align4:
412; GFX10:       ; %bb.0:
413; GFX10-NEXT:    s_clause 0x1
414; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
415; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
416; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
417; GFX10-NEXT:    v_mov_b32_e32 v0, s2
418; GFX10-NEXT:    v_mov_b32_e32 v1, s6
419; GFX10-NEXT:    v_mov_b32_e32 v2, s4
420; GFX10-NEXT:    v_mov_b32_e32 v3, s5
421; GFX10-NEXT:    ds_write_b32 v0, v1 offset:8
422; GFX10-NEXT:    ds_write2_b32 v0, v2, v3 offset1:1
423; GFX10-NEXT:    s_endpgm
424;
425; GFX11-LABEL: store_lds_v3i32_align4:
426; GFX11:       ; %bb.0:
427; GFX11-NEXT:    s_clause 0x1
428; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
429; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x10
430; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
431; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s0
432; GFX11-NEXT:    v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s2
433; GFX11-NEXT:    ds_store_2addr_b32 v0, v1, v2 offset1:1
434; GFX11-NEXT:    ds_store_b32 v0, v3 offset:8
435; GFX11-NEXT:    s_endpgm
436  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 4
437  ret void
438}
439
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked "align 8".
; Every run line expects the same split: a 32-bit DS store of the third dword
; at offset 8, then a 64-bit DS store of the first two dwords at offset 0.
440define amdgpu_kernel void @store_lds_v3i32_align8(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
441; GFX9-LABEL: store_lds_v3i32_align8:
442; GFX9:       ; %bb.0:
443; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
444; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
445; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX9-NEXT:    v_mov_b32_e32 v2, s2
447; GFX9-NEXT:    v_mov_b32_e32 v3, s6
448; GFX9-NEXT:    v_mov_b32_e32 v0, s4
449; GFX9-NEXT:    v_mov_b32_e32 v1, s5
450; GFX9-NEXT:    ds_write_b32 v2, v3 offset:8
451; GFX9-NEXT:    ds_write_b64 v2, v[0:1]
452; GFX9-NEXT:    s_endpgm
453;
454; GFX7-LABEL: store_lds_v3i32_align8:
455; GFX7:       ; %bb.0:
456; GFX7-NEXT:    s_load_dword s4, s[0:1], 0x0
457; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
458; GFX7-NEXT:    s_mov_b32 m0, -1
459; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX7-NEXT:    v_mov_b32_e32 v2, s4
461; GFX7-NEXT:    v_mov_b32_e32 v1, s2
462; GFX7-NEXT:    v_mov_b32_e32 v0, s0
463; GFX7-NEXT:    ds_write_b32 v2, v1 offset:8
464; GFX7-NEXT:    v_mov_b32_e32 v1, s1
465; GFX7-NEXT:    ds_write_b64 v2, v[0:1]
466; GFX7-NEXT:    s_endpgm
467;
468; GFX6-LABEL: store_lds_v3i32_align8:
469; GFX6:       ; %bb.0:
470; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x0
471; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
472; GFX6-NEXT:    s_mov_b32 m0, -1
473; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
474; GFX6-NEXT:    v_mov_b32_e32 v2, s4
475; GFX6-NEXT:    v_mov_b32_e32 v1, s2
476; GFX6-NEXT:    v_mov_b32_e32 v0, s0
477; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
478; GFX6-NEXT:    v_mov_b32_e32 v1, s1
479; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
480; GFX6-NEXT:    s_endpgm
481;
482; GFX10-LABEL: store_lds_v3i32_align8:
483; GFX10:       ; %bb.0:
484; GFX10-NEXT:    s_clause 0x1
485; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
486; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
487; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
488; GFX10-NEXT:    v_mov_b32_e32 v2, s2
489; GFX10-NEXT:    v_mov_b32_e32 v3, s6
490; GFX10-NEXT:    v_mov_b32_e32 v0, s4
491; GFX10-NEXT:    v_mov_b32_e32 v1, s5
492; GFX10-NEXT:    ds_write_b32 v2, v3 offset:8
493; GFX10-NEXT:    ds_write_b64 v2, v[0:1]
494; GFX10-NEXT:    s_endpgm
495;
496; GFX11-LABEL: store_lds_v3i32_align8:
497; GFX11:       ; %bb.0:
498; GFX11-NEXT:    s_clause 0x1
499; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x0
500; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x10
501; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX11-NEXT:    v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s2
503; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
504; GFX11-NEXT:    ds_store_b32 v2, v3 offset:8
505; GFX11-NEXT:    ds_store_b64 v2, v[0:1]
506; GFX11-NEXT:    s_endpgm
507  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 8
508  ret void
509}
510
; Stores the <3 x i32> kernel argument %x through the addrspace(3) pointer %out
; with the store marked "align 16".
; Expected code for gfx900, hawaii, gfx1010 and gfx1100 is a single 96-bit DS
; store (ds_write_b96 / ds_store_b96); the tahiti run instead expects a
; ds_write_b32 at offset 8 followed by a ds_write_b64.
511define amdgpu_kernel void @store_lds_v3i32_align16(<3 x i32> addrspace(3)* %out, <3 x i32> %x) {
512; GFX9-LABEL: store_lds_v3i32_align16:
513; GFX9:       ; %bb.0:
514; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
515; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x0
516; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
517; GFX9-NEXT:    v_mov_b32_e32 v0, s4
518; GFX9-NEXT:    v_mov_b32_e32 v1, s5
519; GFX9-NEXT:    v_mov_b32_e32 v2, s6
520; GFX9-NEXT:    v_mov_b32_e32 v3, s2
521; GFX9-NEXT:    ds_write_b96 v3, v[0:2]
522; GFX9-NEXT:    s_endpgm
523;
524; GFX7-LABEL: store_lds_v3i32_align16:
525; GFX7:       ; %bb.0:
526; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x4
527; GFX7-NEXT:    s_load_dword s0, s[0:1], 0x0
528; GFX7-NEXT:    s_mov_b32 m0, -1
529; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
530; GFX7-NEXT:    v_mov_b32_e32 v0, s4
531; GFX7-NEXT:    v_mov_b32_e32 v1, s5
532; GFX7-NEXT:    v_mov_b32_e32 v2, s6
533; GFX7-NEXT:    v_mov_b32_e32 v3, s0
534; GFX7-NEXT:    ds_write_b96 v3, v[0:2]
535; GFX7-NEXT:    s_endpgm
536;
537; GFX6-LABEL: store_lds_v3i32_align16:
538; GFX6:       ; %bb.0:
539; GFX6-NEXT:    s_load_dword s4, s[0:1], 0x0
540; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x4
541; GFX6-NEXT:    s_mov_b32 m0, -1
542; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
543; GFX6-NEXT:    v_mov_b32_e32 v2, s4
544; GFX6-NEXT:    v_mov_b32_e32 v1, s2
545; GFX6-NEXT:    v_mov_b32_e32 v0, s0
546; GFX6-NEXT:    ds_write_b32 v2, v1 offset:8
547; GFX6-NEXT:    v_mov_b32_e32 v1, s1
548; GFX6-NEXT:    ds_write_b64 v2, v[0:1]
549; GFX6-NEXT:    s_endpgm
550;
551; GFX10-LABEL: store_lds_v3i32_align16:
552; GFX10:       ; %bb.0:
553; GFX10-NEXT:    s_clause 0x1
554; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x10
555; GFX10-NEXT:    s_load_dword s2, s[0:1], 0x0
556; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX10-NEXT:    v_mov_b32_e32 v0, s4
558; GFX10-NEXT:    v_mov_b32_e32 v1, s5
559; GFX10-NEXT:    v_mov_b32_e32 v2, s6
560; GFX10-NEXT:    v_mov_b32_e32 v3, s2
561; GFX10-NEXT:    ds_write_b96 v3, v[0:2]
562; GFX10-NEXT:    s_endpgm
563;
564; GFX11-LABEL: store_lds_v3i32_align16:
565; GFX11:       ; %bb.0:
566; GFX11-NEXT:    s_clause 0x1
567; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x10
568; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
569; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
570; GFX11-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
571; GFX11-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s0
572; GFX11-NEXT:    ds_store_b96 v3, v[0:2]
573; GFX11-NEXT:    s_endpgm
574  store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 16
575  ret void
576}
577