; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

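; A <4 x i8> copy should be a single dword load and store, not split into
; per-byte operations.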
define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

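; The loaded dword is reused for both stores without being repacked.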
define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
  ret void
}

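; Same as above with three output pointers.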
define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
  ret void
}

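; Same as above with four output pointers.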
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x11
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    s_mov_b32 s18, s10
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s19, s11
; SI-NEXT:    s_mov_b32 s22, s10
; SI-NEXT:    s_mov_b32 s23, s11
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s16, s4
; SI-NEXT:    s_mov_b32 s17, s5
; SI-NEXT:    s_mov_b32 s20, s6
; SI-NEXT:    s_mov_b32 s21, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s22, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s23, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s20, s6
; VI-NEXT:    s_mov_b32 s21, s7
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
  ret void
}

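; The add use forces the bytes to be unpacked and repacked for the
; arithmetic, but the unmodified value is still stored as a single dword.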
define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s6, 0
; SI-NEXT:    s_mov_b32 s7, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
  store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
  ret void
}

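; A <3 x i8> copy with dword alignment loads a full dword and stores a
; short plus a byte.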
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
  %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

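; With 2-byte alignment the copy is a short plus a byte in both directions.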
define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
  ret void
}

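; With 1-byte alignment SI copies byte by byte; VI loads the first two
; bytes as a ushort but still stores three individual bytes.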
define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    v_lshrrev_b16_e32 v0, 8, v0
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:1
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
  store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
  ret void
}

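; The volatile load must be kept as a single dword load (glc), but the
; result can still be stored with a plain dword store.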
define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
  store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
  ret void
}

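; The volatile store must not be widened, so the copy is split into byte
; loads and byte stores.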
define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_store:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_store:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; VI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
  store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
  ret void
}