1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s
4; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s
6; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s
7
; Kernel i8_arg takes an i8 argument that carries no extension attribute,
; zero-extends it to i32 in the IR and stores the result through %out.
; The prefixed comment lines in the body are autogenerated expectations (see
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
8define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
9; SI-LABEL: i8_arg:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
12; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
13; SI-NEXT:    s_mov_b32 s3, 0xf000
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_and_b32 s4, s2, 0xff
16; SI-NEXT:    s_mov_b32 s2, -1
17; SI-NEXT:    v_mov_b32_e32 v0, s4
18; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
19; SI-NEXT:    s_endpgm
20;
21; VI-LABEL: i8_arg:
22; VI:       ; %bb.0:
23; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
24; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
25; VI-NEXT:    s_waitcnt lgkmcnt(0)
26; VI-NEXT:    s_and_b32 s2, s2, 0xff
27; VI-NEXT:    v_mov_b32_e32 v0, s0
28; VI-NEXT:    v_mov_b32_e32 v1, s1
29; VI-NEXT:    v_mov_b32_e32 v2, s2
30; VI-NEXT:    flat_store_dword v[0:1], v2
31; VI-NEXT:    s_endpgm
32;
33; GFX9-LABEL: i8_arg:
34; GFX9:       ; %bb.0:
35; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
36; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
37; GFX9-NEXT:    v_mov_b32_e32 v0, 0
38; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
40; GFX9-NEXT:    v_mov_b32_e32 v1, s2
41; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
42; GFX9-NEXT:    s_endpgm
43;
44; EG-LABEL: i8_arg:
45; EG:       ; %bb.0:
46; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
47; EG-NEXT:    TEX 0 @6
48; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
49; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
50; EG-NEXT:    CF_END
51; EG-NEXT:    PAD
52; EG-NEXT:    Fetch clause starting at 6:
53; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
54; EG-NEXT:    ALU clause starting at 8:
55; EG-NEXT:     MOV * T0.X, 0.0,
56; EG-NEXT:    ALU clause starting at 9:
57; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
58; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
59;
60; CM-LABEL: i8_arg:
61; CM:       ; %bb.0:
62; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
63; CM-NEXT:    TEX 0 @6
64; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
65; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
66; CM-NEXT:    CF_END
67; CM-NEXT:    PAD
68; CM-NEXT:    Fetch clause starting at 6:
69; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
70; CM-NEXT:    ALU clause starting at 8:
71; CM-NEXT:     MOV * T0.X, 0.0,
72; CM-NEXT:    ALU clause starting at 9:
73; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
74; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Widen the 8-bit argument with an explicit zext, then store all 32 bits.
75  %ext = zext i8 %in to i32
76  store i32 %ext, i32 addrspace(1)* %out, align 4
77  ret void
78}
79
; Kernel i8_zext_arg takes an i8 argument marked zeroext, zero-extends it to
; i32 in the IR and stores the result through %out.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; NOTE(review) - the GCN expectations still mask the loaded dword with 0xff
; even though the parameter is zeroext, and the R600 expectations contain
; BFE_INT with literal 8, the same sequence expected for the signext variant.
; Confirm that this is the intended lowering for a zeroext argument.
80define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
81; SI-LABEL: i8_zext_arg:
82; SI:       ; %bb.0:
83; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
84; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
85; SI-NEXT:    s_mov_b32 s3, 0xf000
86; SI-NEXT:    s_waitcnt lgkmcnt(0)
87; SI-NEXT:    s_and_b32 s4, s2, 0xff
88; SI-NEXT:    s_mov_b32 s2, -1
89; SI-NEXT:    v_mov_b32_e32 v0, s4
90; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
91; SI-NEXT:    s_endpgm
92;
93; VI-LABEL: i8_zext_arg:
94; VI:       ; %bb.0:
95; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
96; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
97; VI-NEXT:    s_waitcnt lgkmcnt(0)
98; VI-NEXT:    s_and_b32 s2, s2, 0xff
99; VI-NEXT:    v_mov_b32_e32 v0, s0
100; VI-NEXT:    v_mov_b32_e32 v1, s1
101; VI-NEXT:    v_mov_b32_e32 v2, s2
102; VI-NEXT:    flat_store_dword v[0:1], v2
103; VI-NEXT:    s_endpgm
104;
105; GFX9-LABEL: i8_zext_arg:
106; GFX9:       ; %bb.0:
107; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
108; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
109; GFX9-NEXT:    v_mov_b32_e32 v0, 0
110; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
112; GFX9-NEXT:    v_mov_b32_e32 v1, s2
113; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
114; GFX9-NEXT:    s_endpgm
115;
116; EG-LABEL: i8_zext_arg:
117; EG:       ; %bb.0:
118; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
119; EG-NEXT:    TEX 0 @6
120; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
121; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
122; EG-NEXT:    CF_END
123; EG-NEXT:    PAD
124; EG-NEXT:    Fetch clause starting at 6:
125; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
126; EG-NEXT:    ALU clause starting at 8:
127; EG-NEXT:     MOV * T0.X, 0.0,
128; EG-NEXT:    ALU clause starting at 9:
129; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
130; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
131; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
132;
133; CM-LABEL: i8_zext_arg:
134; CM:       ; %bb.0:
135; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
136; CM-NEXT:    TEX 0 @6
137; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
138; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
139; CM-NEXT:    CF_END
140; CM-NEXT:    PAD
141; CM-NEXT:    Fetch clause starting at 6:
142; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
143; CM-NEXT:    ALU clause starting at 8:
144; CM-NEXT:     MOV * T0.X, 0.0,
145; CM-NEXT:    ALU clause starting at 9:
146; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
147; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
148; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
149; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The IR-level zext matches the zeroext attribute on the parameter.
150  %ext = zext i8 %in to i32
151  store i32 %ext, i32 addrspace(1)* %out, align 4
152  ret void
153}
154
; Kernel i8_sext_arg takes an i8 argument marked signext, sign-extends it to
; i32 in the IR and stores the result through %out.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; The GCN expectations contain s_sext_i32_i8 and the R600 expectations contain
; BFE_INT with literal 8.
155define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
156; SI-LABEL: i8_sext_arg:
157; SI:       ; %bb.0:
158; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
159; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
160; SI-NEXT:    s_mov_b32 s3, 0xf000
161; SI-NEXT:    s_waitcnt lgkmcnt(0)
162; SI-NEXT:    s_sext_i32_i8 s4, s2
163; SI-NEXT:    s_mov_b32 s2, -1
164; SI-NEXT:    v_mov_b32_e32 v0, s4
165; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
166; SI-NEXT:    s_endpgm
167;
168; VI-LABEL: i8_sext_arg:
169; VI:       ; %bb.0:
170; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
171; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
172; VI-NEXT:    s_waitcnt lgkmcnt(0)
173; VI-NEXT:    s_sext_i32_i8 s2, s2
174; VI-NEXT:    v_mov_b32_e32 v0, s0
175; VI-NEXT:    v_mov_b32_e32 v1, s1
176; VI-NEXT:    v_mov_b32_e32 v2, s2
177; VI-NEXT:    flat_store_dword v[0:1], v2
178; VI-NEXT:    s_endpgm
179;
180; GFX9-LABEL: i8_sext_arg:
181; GFX9:       ; %bb.0:
182; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
183; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
184; GFX9-NEXT:    v_mov_b32_e32 v0, 0
185; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX9-NEXT:    s_sext_i32_i8 s2, s2
187; GFX9-NEXT:    v_mov_b32_e32 v1, s2
188; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
189; GFX9-NEXT:    s_endpgm
190;
191; EG-LABEL: i8_sext_arg:
192; EG:       ; %bb.0:
193; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
194; EG-NEXT:    TEX 0 @6
195; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
196; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
197; EG-NEXT:    CF_END
198; EG-NEXT:    PAD
199; EG-NEXT:    Fetch clause starting at 6:
200; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
201; EG-NEXT:    ALU clause starting at 8:
202; EG-NEXT:     MOV * T0.X, 0.0,
203; EG-NEXT:    ALU clause starting at 9:
204; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
205; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
206; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
207;
208; CM-LABEL: i8_sext_arg:
209; CM:       ; %bb.0:
210; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
211; CM-NEXT:    TEX 0 @6
212; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
213; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
214; CM-NEXT:    CF_END
215; CM-NEXT:    PAD
216; CM-NEXT:    Fetch clause starting at 6:
217; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
218; CM-NEXT:    ALU clause starting at 8:
219; CM-NEXT:     MOV * T0.X, 0.0,
220; CM-NEXT:    ALU clause starting at 9:
221; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
222; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
223; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
224; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The IR-level sext matches the signext attribute on the parameter.
225  %ext = sext i8 %in to i32
226  store i32 %ext, i32 addrspace(1)* %out, align 4
227  ret void
228}
229
; Kernel i16_arg takes an i16 argument that carries no extension attribute,
; zero-extends it to i32 in the IR and stores the result through %out.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
230define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
231; SI-LABEL: i16_arg:
232; SI:       ; %bb.0:
233; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
234; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
235; SI-NEXT:    s_mov_b32 s3, 0xf000
236; SI-NEXT:    s_waitcnt lgkmcnt(0)
237; SI-NEXT:    s_and_b32 s4, s2, 0xffff
238; SI-NEXT:    s_mov_b32 s2, -1
239; SI-NEXT:    v_mov_b32_e32 v0, s4
240; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
241; SI-NEXT:    s_endpgm
242;
243; VI-LABEL: i16_arg:
244; VI:       ; %bb.0:
245; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
246; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
247; VI-NEXT:    s_waitcnt lgkmcnt(0)
248; VI-NEXT:    s_and_b32 s2, s2, 0xffff
249; VI-NEXT:    v_mov_b32_e32 v0, s0
250; VI-NEXT:    v_mov_b32_e32 v1, s1
251; VI-NEXT:    v_mov_b32_e32 v2, s2
252; VI-NEXT:    flat_store_dword v[0:1], v2
253; VI-NEXT:    s_endpgm
254;
255; GFX9-LABEL: i16_arg:
256; GFX9:       ; %bb.0:
257; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
258; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
259; GFX9-NEXT:    v_mov_b32_e32 v0, 0
260; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
262; GFX9-NEXT:    v_mov_b32_e32 v1, s2
263; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
264; GFX9-NEXT:    s_endpgm
265;
266; EG-LABEL: i16_arg:
267; EG:       ; %bb.0:
268; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
269; EG-NEXT:    TEX 0 @6
270; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
271; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
272; EG-NEXT:    CF_END
273; EG-NEXT:    PAD
274; EG-NEXT:    Fetch clause starting at 6:
275; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
276; EG-NEXT:    ALU clause starting at 8:
277; EG-NEXT:     MOV * T0.X, 0.0,
278; EG-NEXT:    ALU clause starting at 9:
279; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
280; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
281;
282; CM-LABEL: i16_arg:
283; CM:       ; %bb.0:
284; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
285; CM-NEXT:    TEX 0 @6
286; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
287; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
288; CM-NEXT:    CF_END
289; CM-NEXT:    PAD
290; CM-NEXT:    Fetch clause starting at 6:
291; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
292; CM-NEXT:    ALU clause starting at 8:
293; CM-NEXT:     MOV * T0.X, 0.0,
294; CM-NEXT:    ALU clause starting at 9:
295; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
296; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Widen the 16-bit argument with an explicit zext, then store all 32 bits.
297  %ext = zext i16 %in to i32
298  store i32 %ext, i32 addrspace(1)* %out, align 4
299  ret void
300}
301
; Kernel i16_zext_arg takes an i16 argument marked zeroext, zero-extends it to
; i32 in the IR and stores the result through %out.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; NOTE(review) - the GCN expectations still mask the loaded dword with 0xffff
; even though the parameter is zeroext, and the R600 expectations contain
; BFE_INT with literal 16, the same sequence expected for the signext variant.
; Confirm that this is the intended lowering for a zeroext argument.
302define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
303; SI-LABEL: i16_zext_arg:
304; SI:       ; %bb.0:
305; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
306; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
307; SI-NEXT:    s_mov_b32 s3, 0xf000
308; SI-NEXT:    s_waitcnt lgkmcnt(0)
309; SI-NEXT:    s_and_b32 s4, s2, 0xffff
310; SI-NEXT:    s_mov_b32 s2, -1
311; SI-NEXT:    v_mov_b32_e32 v0, s4
312; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
313; SI-NEXT:    s_endpgm
314;
315; VI-LABEL: i16_zext_arg:
316; VI:       ; %bb.0:
317; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
318; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
319; VI-NEXT:    s_waitcnt lgkmcnt(0)
320; VI-NEXT:    s_and_b32 s2, s2, 0xffff
321; VI-NEXT:    v_mov_b32_e32 v0, s0
322; VI-NEXT:    v_mov_b32_e32 v1, s1
323; VI-NEXT:    v_mov_b32_e32 v2, s2
324; VI-NEXT:    flat_store_dword v[0:1], v2
325; VI-NEXT:    s_endpgm
326;
327; GFX9-LABEL: i16_zext_arg:
328; GFX9:       ; %bb.0:
329; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
330; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
331; GFX9-NEXT:    v_mov_b32_e32 v0, 0
332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
334; GFX9-NEXT:    v_mov_b32_e32 v1, s2
335; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
336; GFX9-NEXT:    s_endpgm
337;
338; EG-LABEL: i16_zext_arg:
339; EG:       ; %bb.0:
340; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
341; EG-NEXT:    TEX 0 @6
342; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
343; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
344; EG-NEXT:    CF_END
345; EG-NEXT:    PAD
346; EG-NEXT:    Fetch clause starting at 6:
347; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
348; EG-NEXT:    ALU clause starting at 8:
349; EG-NEXT:     MOV * T0.X, 0.0,
350; EG-NEXT:    ALU clause starting at 9:
351; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
352; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
353; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
354;
355; CM-LABEL: i16_zext_arg:
356; CM:       ; %bb.0:
357; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
358; CM-NEXT:    TEX 0 @6
359; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
360; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
361; CM-NEXT:    CF_END
362; CM-NEXT:    PAD
363; CM-NEXT:    Fetch clause starting at 6:
364; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
365; CM-NEXT:    ALU clause starting at 8:
366; CM-NEXT:     MOV * T0.X, 0.0,
367; CM-NEXT:    ALU clause starting at 9:
368; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
369; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
370; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
371; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The IR-level zext matches the zeroext attribute on the parameter.
372  %ext = zext i16 %in to i32
373  store i32 %ext, i32 addrspace(1)* %out, align 4
374  ret void
375}
376
; Kernel i16_sext_arg takes an i16 argument marked signext, sign-extends it to
; i32 in the IR and stores the result through %out.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; The GCN expectations contain s_sext_i32_i16 and the R600 expectations contain
; BFE_INT with literal 16.
377define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
378; SI-LABEL: i16_sext_arg:
379; SI:       ; %bb.0:
380; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
381; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
382; SI-NEXT:    s_mov_b32 s3, 0xf000
383; SI-NEXT:    s_waitcnt lgkmcnt(0)
384; SI-NEXT:    s_sext_i32_i16 s4, s2
385; SI-NEXT:    s_mov_b32 s2, -1
386; SI-NEXT:    v_mov_b32_e32 v0, s4
387; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
388; SI-NEXT:    s_endpgm
389;
390; VI-LABEL: i16_sext_arg:
391; VI:       ; %bb.0:
392; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
393; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
394; VI-NEXT:    s_waitcnt lgkmcnt(0)
395; VI-NEXT:    s_sext_i32_i16 s2, s2
396; VI-NEXT:    v_mov_b32_e32 v0, s0
397; VI-NEXT:    v_mov_b32_e32 v1, s1
398; VI-NEXT:    v_mov_b32_e32 v2, s2
399; VI-NEXT:    flat_store_dword v[0:1], v2
400; VI-NEXT:    s_endpgm
401;
402; GFX9-LABEL: i16_sext_arg:
403; GFX9:       ; %bb.0:
404; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
405; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
406; GFX9-NEXT:    v_mov_b32_e32 v0, 0
407; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
408; GFX9-NEXT:    s_sext_i32_i16 s2, s2
409; GFX9-NEXT:    v_mov_b32_e32 v1, s2
410; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
411; GFX9-NEXT:    s_endpgm
412;
413; EG-LABEL: i16_sext_arg:
414; EG:       ; %bb.0:
415; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
416; EG-NEXT:    TEX 0 @6
417; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
418; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
419; EG-NEXT:    CF_END
420; EG-NEXT:    PAD
421; EG-NEXT:    Fetch clause starting at 6:
422; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
423; EG-NEXT:    ALU clause starting at 8:
424; EG-NEXT:     MOV * T0.X, 0.0,
425; EG-NEXT:    ALU clause starting at 9:
426; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
427; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
428; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
429;
430; CM-LABEL: i16_sext_arg:
431; CM:       ; %bb.0:
432; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
433; CM-NEXT:    TEX 0 @6
434; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
435; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
436; CM-NEXT:    CF_END
437; CM-NEXT:    PAD
438; CM-NEXT:    Fetch clause starting at 6:
439; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
440; CM-NEXT:    ALU clause starting at 8:
441; CM-NEXT:     MOV * T0.X, 0.0,
442; CM-NEXT:    ALU clause starting at 9:
443; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
444; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
445; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
446; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; The IR-level sext matches the signext attribute on the parameter.
447  %ext = sext i16 %in to i32
448  store i32 %ext, i32 addrspace(1)* %out, align 4
449  ret void
450}
451
; Kernel i32_arg stores its i32 argument %in unchanged through %out; no
; extension or conversion is involved.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; The expectations mention the %entry block label, so renaming that label
; requires regenerating them.
452define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
453; SI-LABEL: i32_arg:
454; SI:       ; %bb.0: ; %entry
455; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
456; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
457; SI-NEXT:    s_mov_b32 s3, 0xf000
458; SI-NEXT:    s_mov_b32 s2, -1
459; SI-NEXT:    s_waitcnt lgkmcnt(0)
460; SI-NEXT:    v_mov_b32_e32 v0, s4
461; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
462; SI-NEXT:    s_endpgm
463;
464; VI-LABEL: i32_arg:
465; VI:       ; %bb.0: ; %entry
466; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
467; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
468; VI-NEXT:    s_waitcnt lgkmcnt(0)
469; VI-NEXT:    v_mov_b32_e32 v0, s2
470; VI-NEXT:    v_mov_b32_e32 v1, s3
471; VI-NEXT:    v_mov_b32_e32 v2, s0
472; VI-NEXT:    flat_store_dword v[0:1], v2
473; VI-NEXT:    s_endpgm
474;
475; GFX9-LABEL: i32_arg:
476; GFX9:       ; %bb.0: ; %entry
477; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
478; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
479; GFX9-NEXT:    v_mov_b32_e32 v0, 0
480; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX9-NEXT:    v_mov_b32_e32 v1, s2
482; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
483; GFX9-NEXT:    s_endpgm
484;
485; EG-LABEL: i32_arg:
486; EG:       ; %bb.0: ; %entry
487; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
488; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
489; EG-NEXT:    CF_END
490; EG-NEXT:    PAD
491; EG-NEXT:    ALU clause starting at 4:
492; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
493; EG-NEXT:     MOV * T1.X, KC0[2].Z,
494; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
495;
496; CM-LABEL: i32_arg:
497; CM:       ; %bb.0: ; %entry
498; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
499; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
500; CM-NEXT:    CF_END
501; CM-NEXT:    PAD
502; CM-NEXT:    ALU clause starting at 4:
503; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
504; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
505; CM-NEXT:     MOV * T1.X, KC0[2].Z,
506entry:
; Direct 4-byte-aligned store of the full-width argument.
507  store i32 %in, i32 addrspace(1)* %out, align 4
508  ret void
509}
510
; Kernel f32_arg stores its float argument %in unchanged through %out.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; The expected instruction sequences here are the same as those listed for the
; i32 argument kernel in this file.
511define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
512; SI-LABEL: f32_arg:
513; SI:       ; %bb.0: ; %entry
514; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
515; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
516; SI-NEXT:    s_mov_b32 s3, 0xf000
517; SI-NEXT:    s_mov_b32 s2, -1
518; SI-NEXT:    s_waitcnt lgkmcnt(0)
519; SI-NEXT:    v_mov_b32_e32 v0, s4
520; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
521; SI-NEXT:    s_endpgm
522;
523; VI-LABEL: f32_arg:
524; VI:       ; %bb.0: ; %entry
525; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
526; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
527; VI-NEXT:    s_waitcnt lgkmcnt(0)
528; VI-NEXT:    v_mov_b32_e32 v0, s2
529; VI-NEXT:    v_mov_b32_e32 v1, s3
530; VI-NEXT:    v_mov_b32_e32 v2, s0
531; VI-NEXT:    flat_store_dword v[0:1], v2
532; VI-NEXT:    s_endpgm
533;
534; GFX9-LABEL: f32_arg:
535; GFX9:       ; %bb.0: ; %entry
536; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
537; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
538; GFX9-NEXT:    v_mov_b32_e32 v0, 0
539; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
540; GFX9-NEXT:    v_mov_b32_e32 v1, s2
541; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
542; GFX9-NEXT:    s_endpgm
543;
544; EG-LABEL: f32_arg:
545; EG:       ; %bb.0: ; %entry
546; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
547; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
548; EG-NEXT:    CF_END
549; EG-NEXT:    PAD
550; EG-NEXT:    ALU clause starting at 4:
551; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
552; EG-NEXT:     MOV * T1.X, KC0[2].Z,
553; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
554;
555; CM-LABEL: f32_arg:
556; CM:       ; %bb.0: ; %entry
557; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
558; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
559; CM-NEXT:    CF_END
560; CM-NEXT:    PAD
561; CM-NEXT:    ALU clause starting at 4:
562; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
563; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
564; CM-NEXT:     MOV * T1.X, KC0[2].Z,
565entry:
; Direct 4-byte-aligned store of the float argument.
566  store float %in, float addrspace(1)* %out, align 4
567  ret void
568}
569
; Kernel v2i8_arg stores its <2 x i8> vector argument %in through %out.
; Unlike the scalar kernels in this file, the signature has no nocapture or
; nounwind attributes and the store carries no explicit alignment.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; The GCN expectations end in a single 16-bit (short) store, while the R600
; expectations read the two bytes separately and finish with MEM_RAT MSKOR.
570define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
571; SI-LABEL: v2i8_arg:
572; SI:       ; %bb.0: ; %entry
573; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
574; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
575; SI-NEXT:    s_mov_b32 s3, 0xf000
576; SI-NEXT:    s_mov_b32 s2, -1
577; SI-NEXT:    s_waitcnt lgkmcnt(0)
578; SI-NEXT:    v_mov_b32_e32 v0, s4
579; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
580; SI-NEXT:    s_endpgm
581;
582; VI-LABEL: v2i8_arg:
583; VI:       ; %bb.0: ; %entry
584; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
585; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
586; VI-NEXT:    s_waitcnt lgkmcnt(0)
587; VI-NEXT:    v_mov_b32_e32 v0, s2
588; VI-NEXT:    v_mov_b32_e32 v1, s3
589; VI-NEXT:    v_mov_b32_e32 v2, s0
590; VI-NEXT:    flat_store_short v[0:1], v2
591; VI-NEXT:    s_endpgm
592;
593; GFX9-LABEL: v2i8_arg:
594; GFX9:       ; %bb.0: ; %entry
595; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
596; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
597; GFX9-NEXT:    v_mov_b32_e32 v0, 0
598; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX9-NEXT:    v_mov_b32_e32 v1, s2
600; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
601; GFX9-NEXT:    s_endpgm
602;
603; EG-LABEL: v2i8_arg:
604; EG:       ; %bb.0: ; %entry
605; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
606; EG-NEXT:    TEX 1 @6
607; EG-NEXT:    ALU 15, @11, KC0[CB0:0-32], KC1[]
608; EG-NEXT:    MEM_RAT MSKOR T4.XW, T5.X
609; EG-NEXT:    CF_END
610; EG-NEXT:    PAD
611; EG-NEXT:    Fetch clause starting at 6:
612; EG-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
613; EG-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
614; EG-NEXT:    ALU clause starting at 10:
615; EG-NEXT:     MOV * T4.X, 0.0,
616; EG-NEXT:    ALU clause starting at 11:
617; EG-NEXT:     LSHL T0.W, T5.X, literal.x,
618; EG-NEXT:     AND_INT * T1.W, T4.X, literal.y,
619; EG-NEXT:    8(1.121039e-44), 255(3.573311e-43)
620; EG-NEXT:     AND_INT T2.W, KC0[2].Y, literal.x,
621; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
622; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
623; EG-NEXT:     AND_INT T0.W, PS, literal.x,
624; EG-NEXT:     LSHL * T1.W, PV.W, literal.y,
625; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
626; EG-NEXT:     LSHL T4.X, PV.W, PS,
627; EG-NEXT:     LSHL * T4.W, literal.x, PS,
628; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
629; EG-NEXT:     MOV T4.Y, 0.0,
630; EG-NEXT:     MOV * T4.Z, 0.0,
631; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
632; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
633;
634; CM-LABEL: v2i8_arg:
635; CM:       ; %bb.0: ; %entry
636; CM-NEXT:    ALU 0, @10, KC0[], KC1[]
637; CM-NEXT:    TEX 1 @6
638; CM-NEXT:    ALU 15, @11, KC0[CB0:0-32], KC1[]
639; CM-NEXT:    MEM_RAT MSKOR T4.XW, T5.X
640; CM-NEXT:    CF_END
641; CM-NEXT:    PAD
642; CM-NEXT:    Fetch clause starting at 6:
643; CM-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
644; CM-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
645; CM-NEXT:    ALU clause starting at 10:
646; CM-NEXT:     MOV * T4.X, 0.0,
647; CM-NEXT:    ALU clause starting at 11:
648; CM-NEXT:     LSHL T0.Z, T5.X, literal.x,
649; CM-NEXT:     AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
650; CM-NEXT:    8(1.121039e-44), 255(3.573311e-43)
651; CM-NEXT:     AND_INT T1.Z, KC0[2].Y, literal.x,
652; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
653; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
654; CM-NEXT:     AND_INT T0.Z, PV.W, literal.x,
655; CM-NEXT:     LSHL * T0.W, PV.Z, literal.y,
656; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
657; CM-NEXT:     LSHL T4.X, PV.Z, PV.W,
658; CM-NEXT:     LSHL * T4.W, literal.x, PV.W,
659; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
660; CM-NEXT:     MOV T4.Y, 0.0,
661; CM-NEXT:     MOV * T4.Z, 0.0,
662; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
663; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
664entry:
; Whole-vector store; alignment is left to the IR default for <2 x i8>.
665  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
666  ret void
667}
668
; Kernel v2i16_arg stores its <2 x i16> vector argument %in through %out.
; The signature has no nocapture or nounwind attributes and the store carries
; no explicit alignment.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; The GCN expectations load and store the vector as one dword, while the R600
; expectations read the two halves separately and combine them with
; LSHL / AND_INT / OR_INT before a single store.
669define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
670; SI-LABEL: v2i16_arg:
671; SI:       ; %bb.0: ; %entry
672; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
673; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
674; SI-NEXT:    s_mov_b32 s3, 0xf000
675; SI-NEXT:    s_mov_b32 s2, -1
676; SI-NEXT:    s_waitcnt lgkmcnt(0)
677; SI-NEXT:    v_mov_b32_e32 v0, s4
678; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
679; SI-NEXT:    s_endpgm
680;
681; VI-LABEL: v2i16_arg:
682; VI:       ; %bb.0: ; %entry
683; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
684; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
685; VI-NEXT:    s_waitcnt lgkmcnt(0)
686; VI-NEXT:    v_mov_b32_e32 v0, s2
687; VI-NEXT:    v_mov_b32_e32 v1, s3
688; VI-NEXT:    v_mov_b32_e32 v2, s0
689; VI-NEXT:    flat_store_dword v[0:1], v2
690; VI-NEXT:    s_endpgm
691;
692; GFX9-LABEL: v2i16_arg:
693; GFX9:       ; %bb.0: ; %entry
694; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
695; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
696; GFX9-NEXT:    v_mov_b32_e32 v0, 0
697; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX9-NEXT:    v_mov_b32_e32 v1, s2
699; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
700; GFX9-NEXT:    s_endpgm
701;
702; EG-LABEL: v2i16_arg:
703; EG:       ; %bb.0: ; %entry
704; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
705; EG-NEXT:    TEX 1 @6
706; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
707; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
708; EG-NEXT:    CF_END
709; EG-NEXT:    PAD
710; EG-NEXT:    Fetch clause starting at 6:
711; EG-NEXT:     VTX_READ_16 T5.X, T4.X, 42, #3
712; EG-NEXT:     VTX_READ_16 T4.X, T4.X, 40, #3
713; EG-NEXT:    ALU clause starting at 10:
714; EG-NEXT:     MOV * T4.X, 0.0,
715; EG-NEXT:    ALU clause starting at 11:
716; EG-NEXT:     LSHL T0.W, T5.X, literal.x,
717; EG-NEXT:     AND_INT * T1.W, T4.X, literal.y,
718; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
719; EG-NEXT:     OR_INT T4.X, PV.W, PS,
720; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
721; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
722;
723; CM-LABEL: v2i16_arg:
724; CM:       ; %bb.0: ; %entry
725; CM-NEXT:    ALU 0, @10, KC0[], KC1[]
726; CM-NEXT:    TEX 1 @6
727; CM-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
728; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X
729; CM-NEXT:    CF_END
730; CM-NEXT:    PAD
731; CM-NEXT:    Fetch clause starting at 6:
732; CM-NEXT:     VTX_READ_16 T5.X, T4.X, 42, #3
733; CM-NEXT:     VTX_READ_16 T4.X, T4.X, 40, #3
734; CM-NEXT:    ALU clause starting at 10:
735; CM-NEXT:     MOV * T4.X, 0.0,
736; CM-NEXT:    ALU clause starting at 11:
737; CM-NEXT:     LSHL T0.Z, T5.X, literal.x,
738; CM-NEXT:     AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
739; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
740; CM-NEXT:     OR_INT * T4.X, PV.Z, PV.W,
741; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
742; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
743entry:
; Whole-vector store; alignment is left to the IR default for <2 x i16>.
744  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
745  ret void
746}
747
; Kernel v2i32_arg stores its <2 x i32> vector argument %in through %out with
; an explicit 4-byte alignment on the store.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; The GCN expectations load the vector with one dwordx2 load and write it with
; one dwordx2 store.
748define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
749; SI-LABEL: v2i32_arg:
750; SI:       ; %bb.0: ; %entry
751; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
752; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
753; SI-NEXT:    s_mov_b32 s3, 0xf000
754; SI-NEXT:    s_mov_b32 s2, -1
755; SI-NEXT:    s_waitcnt lgkmcnt(0)
756; SI-NEXT:    v_mov_b32_e32 v0, s4
757; SI-NEXT:    v_mov_b32_e32 v1, s5
758; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
759; SI-NEXT:    s_endpgm
760;
761; VI-LABEL: v2i32_arg:
762; VI:       ; %bb.0: ; %entry
763; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
764; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
765; VI-NEXT:    s_waitcnt lgkmcnt(0)
766; VI-NEXT:    v_mov_b32_e32 v0, s2
767; VI-NEXT:    v_mov_b32_e32 v3, s1
768; VI-NEXT:    v_mov_b32_e32 v1, s3
769; VI-NEXT:    v_mov_b32_e32 v2, s0
770; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
771; VI-NEXT:    s_endpgm
772;
773; GFX9-LABEL: v2i32_arg:
774; GFX9:       ; %bb.0: ; %entry
775; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
776; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
777; GFX9-NEXT:    v_mov_b32_e32 v2, 0
778; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
779; GFX9-NEXT:    v_mov_b32_e32 v0, s0
780; GFX9-NEXT:    v_mov_b32_e32 v1, s1
781; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
782; GFX9-NEXT:    s_endpgm
783;
784; EG-LABEL: v2i32_arg:
785; EG:       ; %bb.0: ; %entry
786; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
787; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
788; EG-NEXT:    CF_END
789; EG-NEXT:    PAD
790; EG-NEXT:    ALU clause starting at 4:
791; EG-NEXT:     MOV * T0.Y, KC0[3].X,
792; EG-NEXT:     MOV T0.X, KC0[2].W,
793; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
794; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
795;
796; CM-LABEL: v2i32_arg:
797; CM:       ; %bb.0: ; %entry
798; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
799; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
800; CM-NEXT:    CF_END
801; CM-NEXT:    PAD
802; CM-NEXT:    ALU clause starting at 4:
803; CM-NEXT:     MOV * T0.Y, KC0[3].X,
804; CM-NEXT:     MOV * T0.X, KC0[2].W,
805; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
806; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
807entry:
; Whole-vector store with only 4-byte alignment declared.
808  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
809  ret void
810}
811
; Kernel v2f32_arg stores its <2 x float> vector argument %in through %out
; with an explicit 4-byte alignment on the store.
; The prefixed comment lines in the body are autogenerated expectations;
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; The expected instruction sequences here are the same as those listed for the
; <2 x i32> argument kernel in this file.
812define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
813; SI-LABEL: v2f32_arg:
814; SI:       ; %bb.0: ; %entry
815; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
816; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
817; SI-NEXT:    s_mov_b32 s3, 0xf000
818; SI-NEXT:    s_mov_b32 s2, -1
819; SI-NEXT:    s_waitcnt lgkmcnt(0)
820; SI-NEXT:    v_mov_b32_e32 v0, s4
821; SI-NEXT:    v_mov_b32_e32 v1, s5
822; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
823; SI-NEXT:    s_endpgm
824;
825; VI-LABEL: v2f32_arg:
826; VI:       ; %bb.0: ; %entry
827; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
828; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
829; VI-NEXT:    s_waitcnt lgkmcnt(0)
830; VI-NEXT:    v_mov_b32_e32 v0, s2
831; VI-NEXT:    v_mov_b32_e32 v3, s1
832; VI-NEXT:    v_mov_b32_e32 v1, s3
833; VI-NEXT:    v_mov_b32_e32 v2, s0
834; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
835; VI-NEXT:    s_endpgm
836;
837; GFX9-LABEL: v2f32_arg:
838; GFX9:       ; %bb.0: ; %entry
839; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
840; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
841; GFX9-NEXT:    v_mov_b32_e32 v2, 0
842; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX9-NEXT:    v_mov_b32_e32 v0, s0
844; GFX9-NEXT:    v_mov_b32_e32 v1, s1
845; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
846; GFX9-NEXT:    s_endpgm
847;
848; EG-LABEL: v2f32_arg:
849; EG:       ; %bb.0: ; %entry
850; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
851; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
852; EG-NEXT:    CF_END
853; EG-NEXT:    PAD
854; EG-NEXT:    ALU clause starting at 4:
855; EG-NEXT:     MOV * T0.Y, KC0[3].X,
856; EG-NEXT:     MOV T0.X, KC0[2].W,
857; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
858; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
859;
860; CM-LABEL: v2f32_arg:
861; CM:       ; %bb.0: ; %entry
862; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
863; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
864; CM-NEXT:    CF_END
865; CM-NEXT:    PAD
866; CM-NEXT:    ALU clause starting at 4:
867; CM-NEXT:     MOV * T0.Y, KC0[3].X,
868; CM-NEXT:     MOV * T0.X, KC0[2].W,
869; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
870; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
871entry:
; Whole-vector store with only 4-byte alignment declared.
872  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
873  ret void
874}
875
; v3i8_arg: kernel-argument lowering test for a by-value <3 x i8>.
; The kernel stores %in unchanged to the addrspace(1) pointer %out (align 4).
; The check lines below are autogenerated (utils/update_llc_test_checks.py, see
; the file header); regenerate them instead of editing them by hand.
; Expected shape for the SI, VI and GFX9 prefixes: the argument is fetched with
; one s_load_dword and written as a short store at offset 0 plus a byte store at
; byte offset 2 (VI computes the +2 address with s_add_u32/s_addc_u32, GFX9 uses
; global_store_byte_d16_hi). The EG and CM prefixes expect three VTX_READ_8
; fetches and two MEM_RAT MSKOR writes.
876define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
877; SI-LABEL: v3i8_arg:
878; SI:       ; %bb.0: ; %entry
879; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
880; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
881; SI-NEXT:    s_mov_b32 s3, 0xf000
882; SI-NEXT:    s_waitcnt lgkmcnt(0)
883; SI-NEXT:    s_lshr_b32 s5, s4, 16
884; SI-NEXT:    s_mov_b32 s2, -1
885; SI-NEXT:    v_mov_b32_e32 v0, s4
886; SI-NEXT:    v_mov_b32_e32 v1, s5
887; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
888; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
889; SI-NEXT:    s_endpgm
890;
891; VI-LABEL: v3i8_arg:
892; VI:       ; %bb.0: ; %entry
893; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
894; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
895; VI-NEXT:    s_waitcnt lgkmcnt(0)
896; VI-NEXT:    s_lshr_b32 s3, s2, 16
897; VI-NEXT:    v_mov_b32_e32 v0, s0
898; VI-NEXT:    v_mov_b32_e32 v1, s1
899; VI-NEXT:    s_add_u32 s0, s0, 2
900; VI-NEXT:    s_addc_u32 s1, s1, 0
901; VI-NEXT:    v_mov_b32_e32 v3, s1
902; VI-NEXT:    v_mov_b32_e32 v5, s3
903; VI-NEXT:    v_mov_b32_e32 v2, s0
904; VI-NEXT:    v_mov_b32_e32 v4, s2
905; VI-NEXT:    flat_store_byte v[2:3], v5
906; VI-NEXT:    flat_store_short v[0:1], v4
907; VI-NEXT:    s_endpgm
908;
909; GFX9-LABEL: v3i8_arg:
910; GFX9:       ; %bb.0: ; %entry
911; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
912; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
913; GFX9-NEXT:    v_mov_b32_e32 v0, 0
914; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX9-NEXT:    v_mov_b32_e32 v1, s2
916; GFX9-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:2
917; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
918; GFX9-NEXT:    s_endpgm
919;
920; EG-LABEL: v3i8_arg:
921; EG:       ; %bb.0: ; %entry
922; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
923; EG-NEXT:    TEX 2 @6
924; EG-NEXT:    ALU 28, @13, KC0[CB0:0-32], KC1[]
925; EG-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
926; EG-NEXT:    MEM_RAT MSKOR T5.XW, T6.X
927; EG-NEXT:    CF_END
928; EG-NEXT:    Fetch clause starting at 6:
929; EG-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
930; EG-NEXT:     VTX_READ_8 T6.X, T4.X, 42, #3
931; EG-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
932; EG-NEXT:    ALU clause starting at 12:
933; EG-NEXT:     MOV * T4.X, 0.0,
934; EG-NEXT:    ALU clause starting at 13:
935; EG-NEXT:     LSHL T0.W, T5.X, literal.x,
936; EG-NEXT:     AND_INT * T1.W, T4.X, literal.y,
937; EG-NEXT:    8(1.121039e-44), 255(3.573311e-43)
938; EG-NEXT:     AND_INT T2.W, KC0[2].Y, literal.x,
939; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
940; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
941; EG-NEXT:     AND_INT T0.W, PS, literal.x,
942; EG-NEXT:     LSHL * T1.W, PV.W, literal.y,
943; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
944; EG-NEXT:     LSHL T4.X, PV.W, PS,
945; EG-NEXT:     LSHL * T4.W, literal.x, PS,
946; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
947; EG-NEXT:     MOV T4.Y, 0.0,
948; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
949; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
950; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
951; EG-NEXT:     AND_INT * T2.W, T6.X, literal.y,
952; EG-NEXT:    3(4.203895e-45), 255(3.573311e-43)
953; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
954; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
955; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
956; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
957; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
958; EG-NEXT:     MOV T5.Y, 0.0,
959; EG-NEXT:     MOV T4.Z, 0.0,
960; EG-NEXT:     MOV * T5.Z, 0.0,
961; EG-NEXT:     LSHR T6.X, T0.W, literal.x,
962; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
963; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
964;
965; CM-LABEL: v3i8_arg:
966; CM:       ; %bb.0: ; %entry
967; CM-NEXT:    ALU 0, @12, KC0[], KC1[]
968; CM-NEXT:    TEX 2 @6
969; CM-NEXT:    ALU 29, @13, KC0[CB0:0-32], KC1[]
970; CM-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
971; CM-NEXT:    MEM_RAT MSKOR T5.XW, T6.X
972; CM-NEXT:    CF_END
973; CM-NEXT:    Fetch clause starting at 6:
974; CM-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
975; CM-NEXT:     VTX_READ_8 T6.X, T4.X, 42, #3
976; CM-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
977; CM-NEXT:    ALU clause starting at 12:
978; CM-NEXT:     MOV * T4.X, 0.0,
979; CM-NEXT:    ALU clause starting at 13:
980; CM-NEXT:     LSHL T0.Z, T5.X, literal.x,
981; CM-NEXT:     AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
982; CM-NEXT:    8(1.121039e-44), 255(3.573311e-43)
983; CM-NEXT:     AND_INT T1.Z, KC0[2].Y, literal.x,
984; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
985; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
986; CM-NEXT:     AND_INT T0.Z, PV.W, literal.x,
987; CM-NEXT:     LSHL * T0.W, PV.Z, literal.y,
988; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
989; CM-NEXT:     LSHL T4.X, PV.Z, PV.W,
990; CM-NEXT:     LSHL * T4.W, literal.x, PV.W,
991; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
992; CM-NEXT:     MOV T4.Y, 0.0,
993; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
994; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
995; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
996; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
997; CM-NEXT:     AND_INT T0.Z, T6.X, literal.x,
998; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
999; CM-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1000; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
1001; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
1002; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1003; CM-NEXT:     MOV T5.Y, 0.0,
1004; CM-NEXT:     MOV * T4.Z, 0.0,
1005; CM-NEXT:     MOV * T5.Z, 0.0,
1006; CM-NEXT:     LSHR * T6.X, T0.W, literal.x,
1007; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1008; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1009; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1010entry:
1011  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
1012  ret void
1013}
1014
; v3i16_arg: kernel-argument lowering test for a by-value <3 x i16>.
; The kernel stores %in unchanged to the addrspace(1) pointer %out (align 4).
; The check lines below are autogenerated (utils/update_llc_test_checks.py, see
; the file header); regenerate them instead of editing them by hand.
; Expected shape for the SI, VI and GFX9 prefixes: the argument is fetched with
; one s_load_dwordx2 and written as a short store at byte offset 4 plus a dword
; store at offset 0. The EG and CM prefixes expect three VTX_READ_16 fetches,
; one MEM_RAT MSKOR write and one MEM_RAT_CACHELESS store.
1015define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
1016; SI-LABEL: v3i16_arg:
1017; SI:       ; %bb.0: ; %entry
1018; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1019; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1020; SI-NEXT:    s_mov_b32 s3, 0xf000
1021; SI-NEXT:    s_mov_b32 s2, -1
1022; SI-NEXT:    s_waitcnt lgkmcnt(0)
1023; SI-NEXT:    v_mov_b32_e32 v0, s5
1024; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
1025; SI-NEXT:    s_waitcnt expcnt(0)
1026; SI-NEXT:    v_mov_b32_e32 v0, s4
1027; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1028; SI-NEXT:    s_endpgm
1029;
1030; VI-LABEL: v3i16_arg:
1031; VI:       ; %bb.0: ; %entry
1032; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1033; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1034; VI-NEXT:    s_waitcnt lgkmcnt(0)
1035; VI-NEXT:    s_add_u32 s4, s2, 4
1036; VI-NEXT:    s_addc_u32 s5, s3, 0
1037; VI-NEXT:    v_mov_b32_e32 v2, s4
1038; VI-NEXT:    v_mov_b32_e32 v4, s1
1039; VI-NEXT:    v_mov_b32_e32 v0, s2
1040; VI-NEXT:    v_mov_b32_e32 v3, s5
1041; VI-NEXT:    v_mov_b32_e32 v1, s3
1042; VI-NEXT:    v_mov_b32_e32 v5, s0
1043; VI-NEXT:    flat_store_short v[2:3], v4
1044; VI-NEXT:    flat_store_dword v[0:1], v5
1045; VI-NEXT:    s_endpgm
1046;
1047; GFX9-LABEL: v3i16_arg:
1048; GFX9:       ; %bb.0: ; %entry
1049; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1050; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1051; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1052; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1054; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1055; GFX9-NEXT:    global_store_short v0, v1, s[2:3] offset:4
1056; GFX9-NEXT:    global_store_dword v0, v2, s[2:3]
1057; GFX9-NEXT:    s_endpgm
1058;
1059; EG-LABEL: v3i16_arg:
1060; EG:       ; %bb.0: ; %entry
1061; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
1062; EG-NEXT:    TEX 2 @6
1063; EG-NEXT:    ALU 19, @13, KC0[CB0:0-32], KC1[]
1064; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
1065; EG-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
1066; EG-NEXT:    CF_END
1067; EG-NEXT:    Fetch clause starting at 6:
1068; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 44, #3
1069; EG-NEXT:     VTX_READ_16 T7.X, T5.X, 46, #3
1070; EG-NEXT:     VTX_READ_16 T5.X, T5.X, 48, #3
1071; EG-NEXT:    ALU clause starting at 12:
1072; EG-NEXT:     MOV * T5.X, 0.0,
1073; EG-NEXT:    ALU clause starting at 13:
1074; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1075; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1076; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1077; EG-NEXT:     AND_INT * T2.W, T5.X, literal.y,
1078; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1079; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1080; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1081; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
1082; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
1083; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1084; EG-NEXT:     MOV T5.Y, 0.0,
1085; EG-NEXT:     MOV * T5.Z, 0.0,
1086; EG-NEXT:     LSHR T8.X, T0.W, literal.x,
1087; EG-NEXT:     LSHL T0.W, T7.X, literal.y,
1088; EG-NEXT:     AND_INT * T1.W, T6.X, literal.z,
1089; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
1090; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1091; EG-NEXT:     OR_INT T6.X, PV.W, PS,
1092; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1093; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1094;
1095; CM-LABEL: v3i16_arg:
1096; CM:       ; %bb.0: ; %entry
1097; CM-NEXT:    ALU 0, @12, KC0[], KC1[]
1098; CM-NEXT:    TEX 2 @6
1099; CM-NEXT:    ALU 19, @13, KC0[CB0:0-32], KC1[]
1100; CM-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
1101; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
1102; CM-NEXT:    CF_END
1103; CM-NEXT:    Fetch clause starting at 6:
1104; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 44, #3
1105; CM-NEXT:     VTX_READ_16 T7.X, T5.X, 46, #3
1106; CM-NEXT:     VTX_READ_16 T5.X, T5.X, 48, #3
1107; CM-NEXT:    ALU clause starting at 12:
1108; CM-NEXT:     MOV * T5.X, 0.0,
1109; CM-NEXT:    ALU clause starting at 13:
1110; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1111; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1112; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
1113; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1114; CM-NEXT:     AND_INT T0.Z, T5.X, literal.x,
1115; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
1116; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1117; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
1118; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
1119; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1120; CM-NEXT:     MOV T5.Y, 0.0,
1121; CM-NEXT:     MOV * T5.Z, 0.0,
1122; CM-NEXT:     LSHL T0.Z, T7.X, literal.x,
1123; CM-NEXT:     AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
1124; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
1125; CM-NEXT:     OR_INT * T6.X, PV.Z, PV.W,
1126; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1127; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1128; CM-NEXT:     LSHR * T8.X, T0.W, literal.x,
1129; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1130entry:
1131  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
1132  ret void
1133}
1134
; v3i32_arg: kernel-argument lowering test for a by-value <3 x i32>.
; The kernel stores %in unchanged to the addrspace(1) pointer %out (align 4).
; The check lines below are autogenerated (utils/update_llc_test_checks.py, see
; the file header); regenerate them instead of editing them by hand.
; Expected shape: all three GCN prefixes fetch the argument with a single
; s_load_dwordx4. The SI prefix then expects a dword store at byte offset 8 plus
; a dwordx2 store at offset 0, while the VI and GFX9 prefixes expect a single
; dwordx3 store. The EG and CM prefixes expect two MEM_RAT_CACHELESS stores.
1135define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
1136; SI-LABEL: v3i32_arg:
1137; SI:       ; %bb.0: ; %entry
1138; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1139; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1140; SI-NEXT:    s_mov_b32 s3, 0xf000
1141; SI-NEXT:    s_mov_b32 s2, -1
1142; SI-NEXT:    s_waitcnt lgkmcnt(0)
1143; SI-NEXT:    v_mov_b32_e32 v0, s6
1144; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
1145; SI-NEXT:    s_waitcnt expcnt(0)
1146; SI-NEXT:    v_mov_b32_e32 v0, s4
1147; SI-NEXT:    v_mov_b32_e32 v1, s5
1148; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1149; SI-NEXT:    s_endpgm
1150;
1151; VI-LABEL: v3i32_arg:
1152; VI:       ; %bb.0: ; %entry
1153; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1154; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1155; VI-NEXT:    s_waitcnt lgkmcnt(0)
1156; VI-NEXT:    v_mov_b32_e32 v0, s4
1157; VI-NEXT:    v_mov_b32_e32 v4, s1
1158; VI-NEXT:    v_mov_b32_e32 v1, s5
1159; VI-NEXT:    v_mov_b32_e32 v2, s6
1160; VI-NEXT:    v_mov_b32_e32 v3, s0
1161; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
1162; VI-NEXT:    s_endpgm
1163;
1164; GFX9-LABEL: v3i32_arg:
1165; GFX9:       ; %bb.0: ; %entry
1166; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1167; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1168; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1171; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1172; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1173; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
1174; GFX9-NEXT:    s_endpgm
1175;
1176; EG-LABEL: v3i32_arg:
1177; EG:       ; %bb.0: ; %entry
1178; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1179; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1180; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1181; EG-NEXT:    CF_END
1182; EG-NEXT:    ALU clause starting at 4:
1183; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1184; EG-NEXT:     MOV T0.X, KC0[3].Y,
1185; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1186; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1187; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1188; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1189; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1190; EG-NEXT:     MOV * T3.X, KC0[3].W,
1191; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1192;
1193; CM-LABEL: v3i32_arg:
1194; CM:       ; %bb.0: ; %entry
1195; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1196; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1197; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1198; CM-NEXT:    CF_END
1199; CM-NEXT:    ALU clause starting at 4:
1200; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1201; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1202; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
1203; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1204; CM-NEXT:     MOV T1.X, KC0[3].W,
1205; CM-NEXT:     MOV * T2.Y, KC0[3].Z,
1206; CM-NEXT:     MOV * T2.X, KC0[3].Y,
1207; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
1208; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1209entry:
1210  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
1211  ret void
1212}
1213
; v3f32_arg: kernel-argument lowering test for a by-value <3 x float>.
; The kernel stores %in unchanged to the addrspace(1) pointer %out (align 4).
; The check lines below are autogenerated (utils/update_llc_test_checks.py, see
; the file header); regenerate them instead of editing them by hand.
; Expected shape: all three GCN prefixes fetch the argument with a single
; s_load_dwordx4. The SI prefix then expects a dword store at byte offset 8 plus
; a dwordx2 store at offset 0, while the VI and GFX9 prefixes expect a single
; dwordx3 store. The EG and CM prefixes expect two MEM_RAT_CACHELESS stores.
1214define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
1215; SI-LABEL: v3f32_arg:
1216; SI:       ; %bb.0: ; %entry
1217; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1218; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1219; SI-NEXT:    s_mov_b32 s3, 0xf000
1220; SI-NEXT:    s_mov_b32 s2, -1
1221; SI-NEXT:    s_waitcnt lgkmcnt(0)
1222; SI-NEXT:    v_mov_b32_e32 v0, s6
1223; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
1224; SI-NEXT:    s_waitcnt expcnt(0)
1225; SI-NEXT:    v_mov_b32_e32 v0, s4
1226; SI-NEXT:    v_mov_b32_e32 v1, s5
1227; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1228; SI-NEXT:    s_endpgm
1229;
1230; VI-LABEL: v3f32_arg:
1231; VI:       ; %bb.0: ; %entry
1232; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1233; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1234; VI-NEXT:    s_waitcnt lgkmcnt(0)
1235; VI-NEXT:    v_mov_b32_e32 v0, s4
1236; VI-NEXT:    v_mov_b32_e32 v4, s1
1237; VI-NEXT:    v_mov_b32_e32 v1, s5
1238; VI-NEXT:    v_mov_b32_e32 v2, s6
1239; VI-NEXT:    v_mov_b32_e32 v3, s0
1240; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
1241; VI-NEXT:    s_endpgm
1242;
1243; GFX9-LABEL: v3f32_arg:
1244; GFX9:       ; %bb.0: ; %entry
1245; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1246; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1247; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1248; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1249; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1250; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1251; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1252; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
1253; GFX9-NEXT:    s_endpgm
1254;
1255; EG-LABEL: v3f32_arg:
1256; EG:       ; %bb.0: ; %entry
1257; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1258; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1259; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1260; EG-NEXT:    CF_END
1261; EG-NEXT:    ALU clause starting at 4:
1262; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1263; EG-NEXT:     MOV T0.X, KC0[3].Y,
1264; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1265; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1266; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1267; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1268; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1269; EG-NEXT:     MOV * T3.X, KC0[3].W,
1270; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1271;
1272; CM-LABEL: v3f32_arg:
1273; CM:       ; %bb.0: ; %entry
1274; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1275; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1276; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1277; CM-NEXT:    CF_END
1278; CM-NEXT:    ALU clause starting at 4:
1279; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1280; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1281; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
1282; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1283; CM-NEXT:     MOV T1.X, KC0[3].W,
1284; CM-NEXT:     MOV * T2.Y, KC0[3].Z,
1285; CM-NEXT:     MOV * T2.X, KC0[3].Y,
1286; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
1287; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1288entry:
1289  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
1290  ret void
1291}
1292
; v4i8_arg: kernel-argument lowering test for a by-value <4 x i8>.
; The kernel stores %in unchanged to the addrspace(1) pointer %out. Unlike the
; neighbouring vector-argument tests, this function carries no nounwind or
; nocapture attributes and the store has no explicit alignment.
; The check lines below are autogenerated (utils/update_llc_test_checks.py, see
; the file header); regenerate them instead of editing them by hand.
; Expected shape for the SI, VI and GFX9 prefixes: the whole vector is fetched
; with one s_load_dword and written with one dword store. The EG and CM prefixes
; expect four VTX_READ_8 fetches that are combined with AND/LSHL/OR operations
; into a single MEM_RAT_CACHELESS store.
1293define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
1294; SI-LABEL: v4i8_arg:
1295; SI:       ; %bb.0: ; %entry
1296; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
1297; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1298; SI-NEXT:    s_mov_b32 s3, 0xf000
1299; SI-NEXT:    s_mov_b32 s2, -1
1300; SI-NEXT:    s_waitcnt lgkmcnt(0)
1301; SI-NEXT:    v_mov_b32_e32 v0, s4
1302; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1303; SI-NEXT:    s_endpgm
1304;
1305; VI-LABEL: v4i8_arg:
1306; VI:       ; %bb.0: ; %entry
1307; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1308; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
1309; VI-NEXT:    s_waitcnt lgkmcnt(0)
1310; VI-NEXT:    v_mov_b32_e32 v0, s2
1311; VI-NEXT:    v_mov_b32_e32 v1, s3
1312; VI-NEXT:    v_mov_b32_e32 v2, s0
1313; VI-NEXT:    flat_store_dword v[0:1], v2
1314; VI-NEXT:    s_endpgm
1315;
1316; GFX9-LABEL: v4i8_arg:
1317; GFX9:       ; %bb.0: ; %entry
1318; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
1319; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1320; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1321; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1322; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1323; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1324; GFX9-NEXT:    s_endpgm
1325;
1326; EG-LABEL: v4i8_arg:
1327; EG:       ; %bb.0: ; %entry
1328; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
1329; EG-NEXT:    TEX 3 @6
1330; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
1331; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
1332; EG-NEXT:    CF_END
1333; EG-NEXT:    PAD
1334; EG-NEXT:    Fetch clause starting at 6:
1335; EG-NEXT:     VTX_READ_8 T5.X, T4.X, 42, #3
1336; EG-NEXT:     VTX_READ_8 T6.X, T4.X, 40, #3
1337; EG-NEXT:     VTX_READ_8 T7.X, T4.X, 43, #3
1338; EG-NEXT:     VTX_READ_8 T4.X, T4.X, 41, #3
1339; EG-NEXT:    ALU clause starting at 14:
1340; EG-NEXT:     MOV * T4.X, 0.0,
1341; EG-NEXT:    ALU clause starting at 15:
1342; EG-NEXT:     AND_INT * T0.W, T5.X, literal.x,
1343; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1344; EG-NEXT:     AND_INT T0.Z, T4.X, literal.x,
1345; EG-NEXT:     LSHL T0.W, PV.W, literal.y,
1346; EG-NEXT:     LSHL * T1.W, T7.X, literal.z,
1347; EG-NEXT:    255(3.573311e-43), 16(2.242078e-44)
1348; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
1349; EG-NEXT:     OR_INT T0.W, PS, PV.W,
1350; EG-NEXT:     LSHL * T1.W, PV.Z, literal.x,
1351; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1352; EG-NEXT:     OR_INT T0.W, PV.W, PS,
1353; EG-NEXT:     AND_INT * T1.W, T6.X, literal.x,
1354; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1355; EG-NEXT:     OR_INT T4.X, PV.W, PS,
1356; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
1357; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1358;
1359; CM-LABEL: v4i8_arg:
1360; CM:       ; %bb.0: ; %entry
1361; CM-NEXT:    ALU 0, @14, KC0[], KC1[]
1362; CM-NEXT:    TEX 3 @6
1363; CM-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
1364; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X
1365; CM-NEXT:    CF_END
1366; CM-NEXT:    PAD
1367; CM-NEXT:    Fetch clause starting at 6:
1368; CM-NEXT:     VTX_READ_8 T5.X, T4.X, 42, #3
1369; CM-NEXT:     VTX_READ_8 T6.X, T4.X, 40, #3
1370; CM-NEXT:     VTX_READ_8 T7.X, T4.X, 43, #3
1371; CM-NEXT:     VTX_READ_8 T4.X, T4.X, 41, #3
1372; CM-NEXT:    ALU clause starting at 14:
1373; CM-NEXT:     MOV * T4.X, 0.0,
1374; CM-NEXT:    ALU clause starting at 15:
1375; CM-NEXT:     AND_INT * T0.W, T5.X, literal.x,
1376; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1377; CM-NEXT:     AND_INT T0.Y, T4.X, literal.x,
1378; CM-NEXT:     LSHL T0.Z, PV.W, literal.y,
1379; CM-NEXT:     LSHL * T0.W, T7.X, literal.z, BS:VEC_120/SCL_212
1380; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
1381; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
1382; CM-NEXT:     OR_INT T0.Z, PV.W, PV.Z,
1383; CM-NEXT:     LSHL * T0.W, PV.Y, literal.x,
1384; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1385; CM-NEXT:     OR_INT T0.Z, PV.Z, PV.W,
1386; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
1387; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1388; CM-NEXT:     OR_INT * T4.X, PV.Z, PV.W,
1389; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
1390; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1391entry:
1392  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
1393  ret void
1394}
1395
; v4i16_arg: kernel-argument lowering test for a by-value <4 x i16>.
; The kernel stores %in unchanged to the addrspace(1) pointer %out. Unlike the
; neighbouring vector-argument tests, this function carries no nounwind or
; nocapture attributes and the store has no explicit alignment.
; The check lines below are autogenerated (utils/update_llc_test_checks.py, see
; the file header); regenerate them instead of editing them by hand.
; Expected shape for the SI, VI and GFX9 prefixes: the whole vector is fetched
; with one s_load_dwordx2 and written with one dwordx2 store. The EG and CM
; prefixes expect four separate fetch clauses, each holding one VTX_READ_16 and
; each followed by its own ALU clause, ending in a single MEM_RAT_CACHELESS
; store.
1396define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
1397; SI-LABEL: v4i16_arg:
1398; SI:       ; %bb.0: ; %entry
1399; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1400; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1401; SI-NEXT:    s_mov_b32 s3, 0xf000
1402; SI-NEXT:    s_mov_b32 s2, -1
1403; SI-NEXT:    s_waitcnt lgkmcnt(0)
1404; SI-NEXT:    v_mov_b32_e32 v0, s4
1405; SI-NEXT:    v_mov_b32_e32 v1, s5
1406; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1407; SI-NEXT:    s_endpgm
1408;
1409; VI-LABEL: v4i16_arg:
1410; VI:       ; %bb.0: ; %entry
1411; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1412; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1413; VI-NEXT:    s_waitcnt lgkmcnt(0)
1414; VI-NEXT:    v_mov_b32_e32 v0, s2
1415; VI-NEXT:    v_mov_b32_e32 v3, s1
1416; VI-NEXT:    v_mov_b32_e32 v1, s3
1417; VI-NEXT:    v_mov_b32_e32 v2, s0
1418; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1419; VI-NEXT:    s_endpgm
1420;
1421; GFX9-LABEL: v4i16_arg:
1422; GFX9:       ; %bb.0: ; %entry
1423; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1424; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1425; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1426; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1427; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1428; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1429; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
1430; GFX9-NEXT:    s_endpgm
1431;
1432; EG-LABEL: v4i16_arg:
1433; EG:       ; %bb.0: ; %entry
1434; EG-NEXT:    ALU 1, @20, KC0[], KC1[]
1435; EG-NEXT:    TEX 0 @12
1436; EG-NEXT:    ALU 5, @22, KC0[], KC1[]
1437; EG-NEXT:    TEX 0 @14
1438; EG-NEXT:    ALU 5, @28, KC0[], KC1[]
1439; EG-NEXT:    TEX 0 @16
1440; EG-NEXT:    ALU 5, @34, KC0[], KC1[]
1441; EG-NEXT:    TEX 0 @18
1442; EG-NEXT:    ALU 7, @40, KC0[CB0:0-32], KC1[]
1443; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
1444; EG-NEXT:    CF_END
1445; EG-NEXT:    PAD
1446; EG-NEXT:    Fetch clause starting at 12:
1447; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 50, #3
1448; EG-NEXT:    Fetch clause starting at 14:
1449; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 48, #3
1450; EG-NEXT:    Fetch clause starting at 16:
1451; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 46, #3
1452; EG-NEXT:    Fetch clause starting at 18:
1453; EG-NEXT:     VTX_READ_16 T5.X, T5.X, 44, #3
1454; EG-NEXT:    ALU clause starting at 20:
1455; EG-NEXT:     MOV * T0.Y, T3.X,
1456; EG-NEXT:     MOV * T5.X, 0.0,
1457; EG-NEXT:    ALU clause starting at 22:
1458; EG-NEXT:     LSHL T0.W, T6.X, literal.x,
1459; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
1460; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
1461; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
1462; EG-NEXT:     MOV * T3.X, PV.W,
1463; EG-NEXT:     MOV * T0.Y, PV.X,
1464; EG-NEXT:    ALU clause starting at 28:
1465; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
1466; EG-NEXT:     AND_INT * T1.W, T6.X, literal.y,
1467; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
1468; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
1469; EG-NEXT:     MOV T3.X, PV.W,
1470; EG-NEXT:     MOV * T0.Y, T2.X,
1471; EG-NEXT:    ALU clause starting at 34:
1472; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
1473; EG-NEXT:     LSHL * T1.W, T6.X, literal.y,
1474; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
1475; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
1476; EG-NEXT:     MOV * T2.X, PV.W,
1477; EG-NEXT:     MOV * T0.Y, PV.X,
1478; EG-NEXT:    ALU clause starting at 40:
1479; EG-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
1480; EG-NEXT:     AND_INT T0.W, T0.Y, literal.y,
1481; EG-NEXT:     AND_INT * T1.W, T5.X, literal.z,
1482; EG-NEXT:    2(2.802597e-45), -65536(nan)
1483; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1484; EG-NEXT:     OR_INT * T5.X, PV.W, PS,
1485; EG-NEXT:     MOV T2.X, PV.X,
1486; EG-NEXT:     MOV * T5.Y, T3.X,
1487;
1488; CM-LABEL: v4i16_arg:
1489; CM:       ; %bb.0: ; %entry
1490; CM-NEXT:    ALU 1, @20, KC0[], KC1[]
1491; CM-NEXT:    TEX 0 @12
1492; CM-NEXT:    ALU 5, @22, KC0[], KC1[]
1493; CM-NEXT:    TEX 0 @14
1494; CM-NEXT:    ALU 5, @28, KC0[], KC1[]
1495; CM-NEXT:    TEX 0 @16
1496; CM-NEXT:    ALU 5, @34, KC0[], KC1[]
1497; CM-NEXT:    TEX 0 @18
1498; CM-NEXT:    ALU 7, @40, KC0[CB0:0-32], KC1[]
1499; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
1500; CM-NEXT:    CF_END
1501; CM-NEXT:    PAD
1502; CM-NEXT:    Fetch clause starting at 12:
1503; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 50, #3
1504; CM-NEXT:    Fetch clause starting at 14:
1505; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 48, #3
1506; CM-NEXT:    Fetch clause starting at 16:
1507; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 46, #3
1508; CM-NEXT:    Fetch clause starting at 18:
1509; CM-NEXT:     VTX_READ_16 T5.X, T5.X, 44, #3
1510; CM-NEXT:    ALU clause starting at 20:
1511; CM-NEXT:     MOV * T0.Y, T3.X,
1512; CM-NEXT:     MOV * T5.X, 0.0,
1513; CM-NEXT:    ALU clause starting at 22:
1514; CM-NEXT:     LSHL T0.Z, T6.X, literal.x,
1515; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
1516; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
1517; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
1518; CM-NEXT:     MOV * T3.X, PV.W,
1519; CM-NEXT:     MOV * T0.Y, PV.X,
1520; CM-NEXT:    ALU clause starting at 28:
1521; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
1522; CM-NEXT:     AND_INT * T0.W, T6.X, literal.y,
1523; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
1524; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
1525; CM-NEXT:     MOV T3.X, PV.W,
1526; CM-NEXT:     MOV * T0.Y, T2.X,
1527; CM-NEXT:    ALU clause starting at 34:
1528; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
1529; CM-NEXT:     LSHL * T0.W, T6.X, literal.y,
1530; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
1531; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
1532; CM-NEXT:     MOV * T2.X, PV.W,
1533; CM-NEXT:     MOV * T0.Y, PV.X,
1534; CM-NEXT:    ALU clause starting at 40:
1535; CM-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
1536; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
1537; CM-NEXT:     AND_INT * T0.W, T5.X, literal.z,
1538; CM-NEXT:    2(2.802597e-45), -65536(nan)
1539; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1540; CM-NEXT:     OR_INT * T5.X, PV.Z, PV.W,
1541; CM-NEXT:     MOV T2.X, PV.X,
1542; CM-NEXT:     MOV * T5.Y, T3.X,
1543entry:
1544  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
1545  ret void
1546}
1547
; v4i32_arg: kernel-argument lowering test for a by-value <4 x i32>.
; The kernel stores %in unchanged to the addrspace(1) pointer %out (align 4).
; The check lines below are autogenerated (utils/update_llc_test_checks.py, see
; the file header); regenerate them instead of editing them by hand.
; Expected shape for the SI, VI and GFX9 prefixes: the whole vector is fetched
; with one s_load_dwordx4 and written with one dwordx4 store. The EG and CM
; prefixes expect four MOVs from KC0 followed by one MEM_RAT_CACHELESS store.
1548define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
1549; SI-LABEL: v4i32_arg:
1550; SI:       ; %bb.0: ; %entry
1551; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1552; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1553; SI-NEXT:    s_mov_b32 s3, 0xf000
1554; SI-NEXT:    s_mov_b32 s2, -1
1555; SI-NEXT:    s_waitcnt lgkmcnt(0)
1556; SI-NEXT:    v_mov_b32_e32 v0, s4
1557; SI-NEXT:    v_mov_b32_e32 v1, s5
1558; SI-NEXT:    v_mov_b32_e32 v2, s6
1559; SI-NEXT:    v_mov_b32_e32 v3, s7
1560; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1561; SI-NEXT:    s_endpgm
1562;
1563; VI-LABEL: v4i32_arg:
1564; VI:       ; %bb.0: ; %entry
1565; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1566; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
1567; VI-NEXT:    s_waitcnt lgkmcnt(0)
1568; VI-NEXT:    v_mov_b32_e32 v4, s4
1569; VI-NEXT:    v_mov_b32_e32 v0, s0
1570; VI-NEXT:    v_mov_b32_e32 v5, s5
1571; VI-NEXT:    v_mov_b32_e32 v1, s1
1572; VI-NEXT:    v_mov_b32_e32 v2, s2
1573; VI-NEXT:    v_mov_b32_e32 v3, s3
1574; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1575; VI-NEXT:    s_endpgm
1576;
1577; GFX9-LABEL: v4i32_arg:
1578; GFX9:       ; %bb.0: ; %entry
1579; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1580; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1581; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1582; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1583; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1584; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1585; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1586; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1587; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
1588; GFX9-NEXT:    s_endpgm
1589;
1590; EG-LABEL: v4i32_arg:
1591; EG:       ; %bb.0: ; %entry
1592; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1593; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1594; EG-NEXT:    CF_END
1595; EG-NEXT:    PAD
1596; EG-NEXT:    ALU clause starting at 4:
1597; EG-NEXT:     MOV * T0.W, KC0[4].X,
1598; EG-NEXT:     MOV * T0.Z, KC0[3].W,
1599; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1600; EG-NEXT:     MOV T0.X, KC0[3].Y,
1601; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1602; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1603;
1604; CM-LABEL: v4i32_arg:
1605; CM:       ; %bb.0: ; %entry
1606; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1607; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1608; CM-NEXT:    CF_END
1609; CM-NEXT:    PAD
1610; CM-NEXT:    ALU clause starting at 4:
1611; CM-NEXT:     MOV * T0.W, KC0[4].X,
1612; CM-NEXT:     MOV * T0.Z, KC0[3].W,
1613; CM-NEXT:     MOV * T0.Y, KC0[3].Z,
1614; CM-NEXT:     MOV * T0.X, KC0[3].Y,
1615; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1616; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1617entry:
1618  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
1619  ret void
1620}
1621
; v4f32_arg: a <4 x float> kernel argument is stored unchanged through the
; addrspace(1) pointer %out (align 4). The expectations below pin one 128-bit
; store per target: buffer/flat/global *_store_dwordx4 on the amdgcn runs and a
; single MEM_RAT_CACHELESS store of T0 on the r600 runs.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1622define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
1623; SI-LABEL: v4f32_arg:
1624; SI:       ; %bb.0: ; %entry
1625; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1626; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1627; SI-NEXT:    s_mov_b32 s3, 0xf000
1628; SI-NEXT:    s_mov_b32 s2, -1
1629; SI-NEXT:    s_waitcnt lgkmcnt(0)
1630; SI-NEXT:    v_mov_b32_e32 v0, s4
1631; SI-NEXT:    v_mov_b32_e32 v1, s5
1632; SI-NEXT:    v_mov_b32_e32 v2, s6
1633; SI-NEXT:    v_mov_b32_e32 v3, s7
1634; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1635; SI-NEXT:    s_endpgm
1636;
1637; VI-LABEL: v4f32_arg:
1638; VI:       ; %bb.0: ; %entry
1639; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1640; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
1641; VI-NEXT:    s_waitcnt lgkmcnt(0)
1642; VI-NEXT:    v_mov_b32_e32 v4, s4
1643; VI-NEXT:    v_mov_b32_e32 v0, s0
1644; VI-NEXT:    v_mov_b32_e32 v5, s5
1645; VI-NEXT:    v_mov_b32_e32 v1, s1
1646; VI-NEXT:    v_mov_b32_e32 v2, s2
1647; VI-NEXT:    v_mov_b32_e32 v3, s3
1648; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1649; VI-NEXT:    s_endpgm
1650;
1651; GFX9-LABEL: v4f32_arg:
1652; GFX9:       ; %bb.0: ; %entry
1653; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1654; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1655; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1656; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1657; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1658; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1659; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1660; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1661; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
1662; GFX9-NEXT:    s_endpgm
1663;
1664; EG-LABEL: v4f32_arg:
1665; EG:       ; %bb.0: ; %entry
1666; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1667; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1668; EG-NEXT:    CF_END
1669; EG-NEXT:    PAD
1670; EG-NEXT:    ALU clause starting at 4:
1671; EG-NEXT:     MOV * T0.W, KC0[4].X,
1672; EG-NEXT:     MOV * T0.Z, KC0[3].W,
1673; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1674; EG-NEXT:     MOV T0.X, KC0[3].Y,
1675; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1676; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1677;
1678; CM-LABEL: v4f32_arg:
1679; CM:       ; %bb.0: ; %entry
1680; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1681; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1682; CM-NEXT:    CF_END
1683; CM-NEXT:    PAD
1684; CM-NEXT:    ALU clause starting at 4:
1685; CM-NEXT:     MOV * T0.W, KC0[4].X,
1686; CM-NEXT:     MOV * T0.Z, KC0[3].W,
1687; CM-NEXT:     MOV * T0.Y, KC0[3].Z,
1688; CM-NEXT:     MOV * T0.X, KC0[3].Y,
1689; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1690; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1691entry:
1692  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
1693  ret void
1694}
1695
; v5i8_arg: a <5 x i8> kernel argument is stored unchanged through the
; addrspace(1) pointer %out (align 4). On the amdgcn runs the 5-byte value is
; expected to be written as one dword store plus one byte store at offset 4.
; On the r600 runs the expectations show five VTX_READ_8 fetches, one
; MEM_RAT MSKOR store and one MEM_RAT_CACHELESS dword store.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1696define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind {
1697; SI-LABEL: v5i8_arg:
1698; SI:       ; %bb.0: ; %entry
1699; SI-NEXT:    s_load_dword s2, s[0:1], 0xc
1700; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1701; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
1702; SI-NEXT:    s_mov_b32 s7, 0xf000
1703; SI-NEXT:    s_mov_b32 s6, -1
1704; SI-NEXT:    s_waitcnt lgkmcnt(0)
1705; SI-NEXT:    v_mov_b32_e32 v0, s2
1706; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:4
1707; SI-NEXT:    s_waitcnt expcnt(0)
1708; SI-NEXT:    v_mov_b32_e32 v0, s0
1709; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1710; SI-NEXT:    s_endpgm
1711;
1712; VI-LABEL: v5i8_arg:
1713; VI:       ; %bb.0: ; %entry
1714; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1715; VI-NEXT:    s_load_dword s4, s[0:1], 0x30
1716; VI-NEXT:    s_load_dword s5, s[0:1], 0x2c
1717; VI-NEXT:    s_waitcnt lgkmcnt(0)
1718; VI-NEXT:    s_add_u32 s0, s2, 4
1719; VI-NEXT:    s_addc_u32 s1, s3, 0
1720; VI-NEXT:    v_mov_b32_e32 v3, s1
1721; VI-NEXT:    v_mov_b32_e32 v4, s4
1722; VI-NEXT:    v_mov_b32_e32 v0, s2
1723; VI-NEXT:    v_mov_b32_e32 v2, s0
1724; VI-NEXT:    v_mov_b32_e32 v1, s3
1725; VI-NEXT:    flat_store_byte v[2:3], v4
1726; VI-NEXT:    v_mov_b32_e32 v2, s5
1727; VI-NEXT:    flat_store_dword v[0:1], v2
1728; VI-NEXT:    s_endpgm
1729;
1730; GFX9-LABEL: v5i8_arg:
1731; GFX9:       ; %bb.0: ; %entry
1732; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
1733; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
1734; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1735; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1736; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1737; GFX9-NEXT:    v_mov_b32_e32 v2, s0
1738; GFX9-NEXT:    global_store_byte v0, v1, s[2:3] offset:4
1739; GFX9-NEXT:    global_store_dword v0, v2, s[2:3]
1740; GFX9-NEXT:    s_endpgm
1741;
1742; EG-LABEL: v5i8_arg:
1743; EG:       ; %bb.0: ; %entry
1744; EG-NEXT:    ALU 0, @16, KC0[], KC1[]
1745; EG-NEXT:    TEX 4 @6
1746; EG-NEXT:    ALU 28, @17, KC0[CB0:0-32], KC1[]
1747; EG-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
1748; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
1749; EG-NEXT:    CF_END
1750; EG-NEXT:    Fetch clause starting at 6:
1751; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 44, #3
1752; EG-NEXT:     VTX_READ_8 T7.X, T5.X, 47, #3
1753; EG-NEXT:     VTX_READ_8 T8.X, T5.X, 45, #3
1754; EG-NEXT:     VTX_READ_8 T9.X, T5.X, 46, #3
1755; EG-NEXT:     VTX_READ_8 T5.X, T5.X, 48, #3
1756; EG-NEXT:    ALU clause starting at 16:
1757; EG-NEXT:     MOV * T5.X, 0.0,
1758; EG-NEXT:    ALU clause starting at 17:
1759; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1760; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1761; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1762; EG-NEXT:     AND_INT * T2.W, T5.X, literal.y,
1763; EG-NEXT:    3(4.203895e-45), 255(3.573311e-43)
1764; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1765; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1766; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
1767; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
1768; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1769; EG-NEXT:     MOV T5.Y, 0.0,
1770; EG-NEXT:     MOV T5.Z, 0.0,
1771; EG-NEXT:     AND_INT T1.W, T9.X, literal.x,
1772; EG-NEXT:     AND_INT * T0.Z, T8.X, literal.x,
1773; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1774; EG-NEXT:     LSHL T1.W, PV.W, literal.x,
1775; EG-NEXT:     LSHL * T2.W, T7.X, literal.y,
1776; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
1777; EG-NEXT:     OR_INT T1.W, PS, PV.W,
1778; EG-NEXT:     LSHL * T2.W, T0.Z, literal.x,
1779; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1780; EG-NEXT:     OR_INT T1.W, PV.W, PS,
1781; EG-NEXT:     AND_INT * T2.W, T6.X, literal.x,
1782; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1783; EG-NEXT:     OR_INT T6.X, PV.W, PS,
1784; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1785; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1786; EG-NEXT:     LSHR * T8.X, T0.W, literal.x,
1787; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1788;
1789; CM-LABEL: v5i8_arg:
1790; CM:       ; %bb.0: ; %entry
1791; CM-NEXT:    ALU 0, @16, KC0[], KC1[]
1792; CM-NEXT:    TEX 4 @6
1793; CM-NEXT:    ALU 28, @17, KC0[CB0:0-32], KC1[]
1794; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X
1795; CM-NEXT:    MEM_RAT MSKOR T5.XW, T7.X
1796; CM-NEXT:    CF_END
1797; CM-NEXT:    Fetch clause starting at 6:
1798; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 44, #3
1799; CM-NEXT:     VTX_READ_8 T7.X, T5.X, 47, #3
1800; CM-NEXT:     VTX_READ_8 T8.X, T5.X, 45, #3
1801; CM-NEXT:     VTX_READ_8 T9.X, T5.X, 46, #3
1802; CM-NEXT:     VTX_READ_8 T5.X, T5.X, 48, #3
1803; CM-NEXT:    ALU clause starting at 16:
1804; CM-NEXT:     MOV * T5.X, 0.0,
1805; CM-NEXT:    ALU clause starting at 17:
1806; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1807; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1808; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
1809; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1810; CM-NEXT:     AND_INT T0.Z, T5.X, literal.x,
1811; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
1812; CM-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1813; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
1814; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
1815; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1816; CM-NEXT:     MOV T5.Y, 0.0,
1817; CM-NEXT:     MOV T5.Z, 0.0,
1818; CM-NEXT:     AND_INT * T1.W, T9.X, literal.x,
1819; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1820; CM-NEXT:     AND_INT T0.Y, T8.X, literal.x,
1821; CM-NEXT:     LSHL T0.Z, PV.W, literal.y,
1822; CM-NEXT:     LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212
1823; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
1824; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
1825; CM-NEXT:     OR_INT T0.Z, PV.W, PV.Z,
1826; CM-NEXT:     LSHL * T1.W, PV.Y, literal.x,
1827; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1828; CM-NEXT:     LSHR T7.X, T0.W, literal.x,
1829; CM-NEXT:     OR_INT T0.Z, PV.Z, PV.W,
1830; CM-NEXT:     AND_INT * T0.W, T6.X, literal.y,
1831; CM-NEXT:    2(2.802597e-45), 255(3.573311e-43)
1832; CM-NEXT:     OR_INT * T6.X, PV.Z, PV.W,
1833; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
1834; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1835entry:
1836  store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4
1837  ret void
1838}
1839
; v5i16_arg: a <5 x i16> kernel argument is stored unchanged through the
; addrspace(1) pointer %out (align 4). On the amdgcn runs the 10-byte value is
; expected to be written as one dwordx2 store plus one short store at offset 8.
; On the r600 runs the expectations show five VTX_READ_16 fetches and five
; MEM_RAT MSKOR stores (one per element).
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
1840define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind {
1841; SI-LABEL: v5i16_arg:
1842; SI:       ; %bb.0: ; %entry
1843; SI-NEXT:    s_load_dword s2, s[0:1], 0xf
1844; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1845; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1846; SI-NEXT:    s_mov_b32 s7, 0xf000
1847; SI-NEXT:    s_mov_b32 s6, -1
1848; SI-NEXT:    s_waitcnt lgkmcnt(0)
1849; SI-NEXT:    v_mov_b32_e32 v0, s2
1850; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:8
1851; SI-NEXT:    s_waitcnt expcnt(0)
1852; SI-NEXT:    v_mov_b32_e32 v0, s0
1853; SI-NEXT:    v_mov_b32_e32 v1, s1
1854; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1855; SI-NEXT:    s_endpgm
1856;
1857; VI-LABEL: v5i16_arg:
1858; VI:       ; %bb.0: ; %entry
1859; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1860; VI-NEXT:    s_load_dword s5, s[0:1], 0x3c
1861; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1862; VI-NEXT:    s_waitcnt lgkmcnt(0)
1863; VI-NEXT:    s_add_u32 s4, s2, 8
1864; VI-NEXT:    v_mov_b32_e32 v4, s5
1865; VI-NEXT:    s_addc_u32 s5, s3, 0
1866; VI-NEXT:    v_mov_b32_e32 v2, s4
1867; VI-NEXT:    v_mov_b32_e32 v3, s5
1868; VI-NEXT:    v_mov_b32_e32 v0, s2
1869; VI-NEXT:    flat_store_short v[2:3], v4
1870; VI-NEXT:    v_mov_b32_e32 v3, s1
1871; VI-NEXT:    v_mov_b32_e32 v1, s3
1872; VI-NEXT:    v_mov_b32_e32 v2, s0
1873; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1874; VI-NEXT:    s_endpgm
1875;
1876; GFX9-LABEL: v5i16_arg:
1877; GFX9:       ; %bb.0: ; %entry
1878; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1879; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1880; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1881; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1882; GFX9-NEXT:    v_mov_b32_e32 v3, s2
1883; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1884; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1885; GFX9-NEXT:    global_store_short v2, v3, s[6:7] offset:8
1886; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
1887; GFX9-NEXT:    s_endpgm
1888;
1889; EG-LABEL: v5i16_arg:
1890; EG:       ; %bb.0: ; %entry
1891; EG-NEXT:    ALU 0, @20, KC0[], KC1[]
1892; EG-NEXT:    TEX 4 @10
1893; EG-NEXT:    ALU 65, @21, KC0[CB0:0-32], KC1[]
1894; EG-NEXT:    MEM_RAT MSKOR T5.XW, T9.X
1895; EG-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
1896; EG-NEXT:    MEM_RAT MSKOR T3.XW, T2.X
1897; EG-NEXT:    MEM_RAT MSKOR T6.XW, T1.X
1898; EG-NEXT:    MEM_RAT MSKOR T8.XW, T0.X
1899; EG-NEXT:    CF_END
1900; EG-NEXT:    PAD
1901; EG-NEXT:    Fetch clause starting at 10:
1902; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 58, #3
1903; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 56, #3
1904; EG-NEXT:     VTX_READ_16 T3.X, T0.X, 54, #3
1905; EG-NEXT:     VTX_READ_16 T4.X, T0.X, 52, #3
1906; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 60, #3
1907; EG-NEXT:    ALU clause starting at 20:
1908; EG-NEXT:     MOV * T0.X, 0.0,
1909; EG-NEXT:    ALU clause starting at 21:
1910; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1911; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1912; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1913; EG-NEXT:     AND_INT * T2.W, T0.X, literal.y,
1914; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1915; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1916; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1917; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
1918; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
1919; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1920; EG-NEXT:     MOV T5.Y, 0.0,
1921; EG-NEXT:     AND_INT T1.W, KC0[2].Y, literal.x,
1922; EG-NEXT:     AND_INT * T2.W, T4.X, literal.y,
1923; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1924; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1925; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1926; EG-NEXT:     LSHL T4.X, T2.W, PV.W,
1927; EG-NEXT:     LSHL * T4.W, literal.x, PV.W,
1928; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1929; EG-NEXT:     MOV T4.Y, 0.0,
1930; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
1931; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1932; EG-NEXT:     AND_INT T2.W, PV.W, literal.x,
1933; EG-NEXT:     AND_INT * T3.W, T3.X, literal.y,
1934; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1935; EG-NEXT:     LSHL * T2.W, PV.W, literal.x,
1936; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1937; EG-NEXT:     LSHL T3.X, T3.W, PV.W,
1938; EG-NEXT:     LSHL * T3.W, literal.x, PV.W,
1939; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1940; EG-NEXT:     MOV T3.Y, 0.0,
1941; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
1942; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1943; EG-NEXT:     AND_INT T6.W, PV.W, literal.x,
1944; EG-NEXT:     AND_INT * T7.W, T2.X, literal.y,
1945; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1946; EG-NEXT:     LSHL * T6.W, PV.W, literal.x,
1947; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1948; EG-NEXT:     LSHL T6.X, T7.W, PV.W,
1949; EG-NEXT:     LSHL * T6.W, literal.x, PV.W,
1950; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1951; EG-NEXT:     MOV T6.Y, 0.0,
1952; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.x,
1953; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
1954; EG-NEXT:     AND_INT T8.W, PV.W, literal.x,
1955; EG-NEXT:     AND_INT * T9.W, T1.X, literal.y,
1956; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1957; EG-NEXT:     LSHL * T8.W, PV.W, literal.x,
1958; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1959; EG-NEXT:     LSHL T8.X, T9.W, PV.W,
1960; EG-NEXT:     LSHL * T8.W, literal.x, PV.W,
1961; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1962; EG-NEXT:     MOV T8.Y, 0.0,
1963; EG-NEXT:     MOV T5.Z, 0.0,
1964; EG-NEXT:     MOV * T4.Z, 0.0,
1965; EG-NEXT:     MOV T3.Z, 0.0,
1966; EG-NEXT:     MOV * T6.Z, 0.0,
1967; EG-NEXT:     MOV * T8.Z, 0.0,
1968; EG-NEXT:     LSHR T0.X, T7.W, literal.x,
1969; EG-NEXT:     LSHR * T1.X, T2.W, literal.x,
1970; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1971; EG-NEXT:     LSHR T2.X, T1.W, literal.x,
1972; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1973; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1974; EG-NEXT:     LSHR * T9.X, T0.W, literal.x,
1975; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1976;
1977; CM-LABEL: v5i16_arg:
1978; CM:       ; %bb.0: ; %entry
1979; CM-NEXT:    ALU 0, @20, KC0[], KC1[]
1980; CM-NEXT:    TEX 4 @10
1981; CM-NEXT:    ALU 67, @21, KC0[CB0:0-32], KC1[]
1982; CM-NEXT:    MEM_RAT MSKOR T5.XW, T9.X
1983; CM-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
1984; CM-NEXT:    MEM_RAT MSKOR T3.XW, T2.X
1985; CM-NEXT:    MEM_RAT MSKOR T6.XW, T1.X
1986; CM-NEXT:    MEM_RAT MSKOR T8.XW, T0.X
1987; CM-NEXT:    CF_END
1988; CM-NEXT:    PAD
1989; CM-NEXT:    Fetch clause starting at 10:
1990; CM-NEXT:     VTX_READ_16 T1.X, T0.X, 58, #3
1991; CM-NEXT:     VTX_READ_16 T2.X, T0.X, 56, #3
1992; CM-NEXT:     VTX_READ_16 T3.X, T0.X, 54, #3
1993; CM-NEXT:     VTX_READ_16 T4.X, T0.X, 52, #3
1994; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 60, #3
1995; CM-NEXT:    ALU clause starting at 20:
1996; CM-NEXT:     MOV * T0.X, 0.0,
1997; CM-NEXT:    ALU clause starting at 21:
1998; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1999; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2000; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
2001; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2002; CM-NEXT:     AND_INT T0.Z, T0.X, literal.x,
2003; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
2004; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2005; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
2006; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
2007; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2008; CM-NEXT:     MOV T5.Y, 0.0,
2009; CM-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
2010; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2011; CM-NEXT:     AND_INT T0.Z, T4.X, literal.x,
2012; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
2013; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2014; CM-NEXT:     LSHL T4.X, PV.Z, PV.W,
2015; CM-NEXT:     LSHL * T4.W, literal.x, PV.W,
2016; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2017; CM-NEXT:     MOV T4.Y, 0.0,
2018; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
2019; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2020; CM-NEXT:     AND_INT * T2.W, PV.W, literal.x,
2021; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2022; CM-NEXT:     AND_INT T0.Z, T3.X, literal.x,
2023; CM-NEXT:     LSHL * T2.W, PV.W, literal.y,
2024; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2025; CM-NEXT:     LSHL T3.X, PV.Z, PV.W,
2026; CM-NEXT:     LSHL * T3.W, literal.x, PV.W,
2027; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2028; CM-NEXT:     MOV T3.Y, 0.0,
2029; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2030; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
2031; CM-NEXT:     AND_INT * T6.W, PV.W, literal.x,
2032; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2033; CM-NEXT:     AND_INT T0.Z, T2.X, literal.x,
2034; CM-NEXT:     LSHL * T6.W, PV.W, literal.y,
2035; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2036; CM-NEXT:     LSHL T6.X, PV.Z, PV.W,
2037; CM-NEXT:     LSHL * T6.W, literal.x, PV.W,
2038; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2039; CM-NEXT:     MOV T6.Y, 0.0,
2040; CM-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.x,
2041; CM-NEXT:    6(8.407791e-45), 0(0.000000e+00)
2042; CM-NEXT:     AND_INT * T8.W, PV.W, literal.x,
2043; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2044; CM-NEXT:     AND_INT T0.Z, T1.X, literal.x,
2045; CM-NEXT:     LSHL * T8.W, PV.W, literal.y,
2046; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2047; CM-NEXT:     LSHL T8.X, PV.Z, PV.W,
2048; CM-NEXT:     LSHL * T8.W, literal.x, PV.W,
2049; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2050; CM-NEXT:     MOV T8.Y, 0.0,
2051; CM-NEXT:     MOV * T5.Z, 0.0,
2052; CM-NEXT:     MOV * T4.Z, 0.0,
2053; CM-NEXT:     MOV * T3.Z, 0.0,
2054; CM-NEXT:     MOV * T6.Z, 0.0,
2055; CM-NEXT:     MOV * T8.Z, 0.0,
2056; CM-NEXT:     LSHR * T0.X, T7.W, literal.x,
2057; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2058; CM-NEXT:     LSHR * T1.X, T2.W, literal.x,
2059; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2060; CM-NEXT:     LSHR * T2.X, T1.W, literal.x,
2061; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2062; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
2063; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2064; CM-NEXT:     LSHR * T9.X, T0.W, literal.x,
2065; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2066entry:
2067  store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4
2068  ret void
2069}
2070
; v5i32_arg: a <5 x i32> kernel argument is stored unchanged through the
; addrspace(1) pointer %out (align 4). On the amdgcn runs the 20-byte value is
; expected to be written as one dwordx4 store plus one dword store at
; offset 16. On the r600 runs the expectations show two MEM_RAT_CACHELESS
; stores, the second address being %out + 16.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2071define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind {
2072; SI-LABEL: v5i32_arg:
2073; SI:       ; %bb.0: ; %entry
2074; SI-NEXT:    s_load_dword s8, s[0:1], 0x15
2075; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2076; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
2077; SI-NEXT:    s_mov_b32 s7, 0xf000
2078; SI-NEXT:    s_mov_b32 s6, -1
2079; SI-NEXT:    s_waitcnt lgkmcnt(0)
2080; SI-NEXT:    v_mov_b32_e32 v0, s8
2081; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:16
2082; SI-NEXT:    s_waitcnt expcnt(0)
2083; SI-NEXT:    v_mov_b32_e32 v0, s0
2084; SI-NEXT:    v_mov_b32_e32 v1, s1
2085; SI-NEXT:    v_mov_b32_e32 v2, s2
2086; SI-NEXT:    v_mov_b32_e32 v3, s3
2087; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2088; SI-NEXT:    s_endpgm
2089;
2090; VI-LABEL: v5i32_arg:
2091; VI:       ; %bb.0: ; %entry
2092; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2093; VI-NEXT:    s_load_dword s7, s[0:1], 0x54
2094; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
2095; VI-NEXT:    s_waitcnt lgkmcnt(0)
2096; VI-NEXT:    s_add_u32 s6, s4, 16
2097; VI-NEXT:    v_mov_b32_e32 v2, s7
2098; VI-NEXT:    s_addc_u32 s7, s5, 0
2099; VI-NEXT:    v_mov_b32_e32 v0, s6
2100; VI-NEXT:    v_mov_b32_e32 v1, s7
2101; VI-NEXT:    v_mov_b32_e32 v4, s4
2102; VI-NEXT:    flat_store_dword v[0:1], v2
2103; VI-NEXT:    v_mov_b32_e32 v0, s0
2104; VI-NEXT:    v_mov_b32_e32 v5, s5
2105; VI-NEXT:    v_mov_b32_e32 v1, s1
2106; VI-NEXT:    v_mov_b32_e32 v2, s2
2107; VI-NEXT:    v_mov_b32_e32 v3, s3
2108; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2109; VI-NEXT:    s_endpgm
2110;
2111; GFX9-LABEL: v5i32_arg:
2112; GFX9:       ; %bb.0: ; %entry
2113; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x30
2114; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
2115; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2116; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2117; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2118; GFX9-NEXT:    v_mov_b32_e32 v5, s8
2119; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2120; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2121; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2122; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2123; GFX9-NEXT:    global_store_dword v4, v5, s[6:7] offset:16
2124; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2125; GFX9-NEXT:    s_endpgm
2126;
2127; EG-LABEL: v5i32_arg:
2128; EG:       ; %bb.0: ; %entry
2129; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2130; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
2131; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2132; EG-NEXT:    CF_END
2133; EG-NEXT:    ALU clause starting at 4:
2134; EG-NEXT:     MOV * T0.W, KC0[5].X,
2135; EG-NEXT:     MOV * T0.Z, KC0[4].W,
2136; EG-NEXT:     MOV * T0.Y, KC0[4].Z,
2137; EG-NEXT:     MOV T0.X, KC0[4].Y,
2138; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2139; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2140; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
2141; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2142; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
2143; EG-NEXT:     MOV * T3.X, KC0[5].Y,
2144; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2145;
2146; CM-LABEL: v5i32_arg:
2147; CM:       ; %bb.0: ; %entry
2148; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2149; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
2150; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
2151; CM-NEXT:    CF_END
2152; CM-NEXT:    ALU clause starting at 4:
2153; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
2154; CM-NEXT:     MOV * T0.W, KC0[5].X,
2155; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2156; CM-NEXT:     LSHR T1.X, PV.Z, literal.x,
2157; CM-NEXT:     MOV * T0.Z, KC0[4].W,
2158; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2159; CM-NEXT:     MOV T2.X, KC0[5].Y,
2160; CM-NEXT:     MOV * T0.Y, KC0[4].Z,
2161; CM-NEXT:     MOV * T0.X, KC0[4].Y,
2162; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2163; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2164entry:
2165  store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4
2166  ret void
2167}
2168
; v5f32_arg: a <5 x float> kernel argument is stored unchanged through the
; addrspace(1) pointer %out (align 4). On the amdgcn runs the 20-byte value is
; expected to be written as one dwordx4 store plus one dword store at
; offset 16. In the gfx900 expectations the dwordx4 store comes first, followed
; by s_nop 0 and then the dword store; in the other two amdgcn runs the dword
; store comes first. On the r600 runs the expectations show two
; MEM_RAT_CACHELESS stores.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2169define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind {
2170; SI-LABEL: v5f32_arg:
2171; SI:       ; %bb.0: ; %entry
2172; SI-NEXT:    s_load_dword s8, s[0:1], 0x15
2173; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2174; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
2175; SI-NEXT:    s_mov_b32 s7, 0xf000
2176; SI-NEXT:    s_mov_b32 s6, -1
2177; SI-NEXT:    s_waitcnt lgkmcnt(0)
2178; SI-NEXT:    v_mov_b32_e32 v0, s8
2179; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:16
2180; SI-NEXT:    s_waitcnt expcnt(0)
2181; SI-NEXT:    v_mov_b32_e32 v0, s0
2182; SI-NEXT:    v_mov_b32_e32 v1, s1
2183; SI-NEXT:    v_mov_b32_e32 v2, s2
2184; SI-NEXT:    v_mov_b32_e32 v3, s3
2185; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2186; SI-NEXT:    s_endpgm
2187;
2188; VI-LABEL: v5f32_arg:
2189; VI:       ; %bb.0: ; %entry
2190; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2191; VI-NEXT:    s_load_dword s7, s[0:1], 0x54
2192; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x44
2193; VI-NEXT:    s_waitcnt lgkmcnt(0)
2194; VI-NEXT:    s_add_u32 s6, s4, 16
2195; VI-NEXT:    v_mov_b32_e32 v3, s7
2196; VI-NEXT:    s_addc_u32 s7, s5, 0
2197; VI-NEXT:    v_mov_b32_e32 v1, s6
2198; VI-NEXT:    v_mov_b32_e32 v2, s7
2199; VI-NEXT:    v_mov_b32_e32 v4, s4
2200; VI-NEXT:    v_mov_b32_e32 v0, s0
2201; VI-NEXT:    flat_store_dword v[1:2], v3
2202; VI-NEXT:    v_mov_b32_e32 v1, s1
2203; VI-NEXT:    v_mov_b32_e32 v2, s2
2204; VI-NEXT:    v_mov_b32_e32 v3, s3
2205; VI-NEXT:    v_mov_b32_e32 v5, s5
2206; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2207; VI-NEXT:    s_endpgm
2208;
2209; GFX9-LABEL: v5f32_arg:
2210; GFX9:       ; %bb.0: ; %entry
2211; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
2212; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2213; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2214; GFX9-NEXT:    s_load_dword s4, s[4:5], 0x30
2215; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2216; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2217; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2218; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2219; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2220; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2221; GFX9-NEXT:    s_nop 0
2222; GFX9-NEXT:    v_mov_b32_e32 v0, s4
2223; GFX9-NEXT:    global_store_dword v4, v0, s[6:7] offset:16
2224; GFX9-NEXT:    s_endpgm
2225;
2226; EG-LABEL: v5f32_arg:
2227; EG:       ; %bb.0: ; %entry
2228; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2229; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
2230; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2231; EG-NEXT:    CF_END
2232; EG-NEXT:    ALU clause starting at 4:
2233; EG-NEXT:     MOV * T0.W, KC0[5].X,
2234; EG-NEXT:     MOV * T0.Z, KC0[4].W,
2235; EG-NEXT:     MOV * T0.Y, KC0[4].Z,
2236; EG-NEXT:     MOV T0.X, KC0[4].Y,
2237; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2238; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2239; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
2240; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2241; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
2242; EG-NEXT:     MOV * T3.X, KC0[5].Y,
2243; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2244;
2245; CM-LABEL: v5f32_arg:
2246; CM:       ; %bb.0: ; %entry
2247; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2248; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
2249; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
2250; CM-NEXT:    CF_END
2251; CM-NEXT:    ALU clause starting at 4:
2252; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
2253; CM-NEXT:     MOV * T0.W, KC0[5].X,
2254; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2255; CM-NEXT:     LSHR T1.X, PV.Z, literal.x,
2256; CM-NEXT:     MOV * T0.Z, KC0[4].W,
2257; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2258; CM-NEXT:     MOV T2.X, KC0[5].Y,
2259; CM-NEXT:     MOV * T0.Y, KC0[4].Z,
2260; CM-NEXT:     MOV * T0.X, KC0[4].Y,
2261; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2262; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2263entry:
2264  store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4
2265  ret void
2266}
2267
; v5i64_arg: a <5 x i64> kernel argument is stored unchanged through the
; addrspace(1) pointer %out (align 8). On the amdgcn runs the 40-byte value is
; expected to be written as two dwordx4 stores (offsets 0 and 16) plus one
; dwordx2 store at offset 32. On the r600 runs the expectations show three
; MEM_RAT_CACHELESS stores addressed from %out, %out + 16 and %out + 32.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2268define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
2269; SI-LABEL: v5i64_arg:
2270; SI:       ; %bb.0: ; %entry
2271; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
2272; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
2273; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x21
2274; SI-NEXT:    s_mov_b32 s15, 0xf000
2275; SI-NEXT:    s_mov_b32 s14, -1
2276; SI-NEXT:    s_waitcnt lgkmcnt(0)
2277; SI-NEXT:    v_mov_b32_e32 v0, s8
2278; SI-NEXT:    v_mov_b32_e32 v1, s9
2279; SI-NEXT:    v_mov_b32_e32 v2, s10
2280; SI-NEXT:    v_mov_b32_e32 v3, s11
2281; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
2282; SI-NEXT:    s_waitcnt expcnt(0)
2283; SI-NEXT:    v_mov_b32_e32 v0, s4
2284; SI-NEXT:    v_mov_b32_e32 v1, s5
2285; SI-NEXT:    v_mov_b32_e32 v2, s6
2286; SI-NEXT:    v_mov_b32_e32 v3, s7
2287; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
2288; SI-NEXT:    s_waitcnt expcnt(0)
2289; SI-NEXT:    v_mov_b32_e32 v0, s0
2290; SI-NEXT:    v_mov_b32_e32 v1, s1
2291; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
2292; SI-NEXT:    s_endpgm
2293;
2294; VI-LABEL: v5i64_arg:
2295; VI:       ; %bb.0: ; %entry
2296; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
2297; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2298; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
2299; VI-NEXT:    s_waitcnt lgkmcnt(0)
2300; VI-NEXT:    v_mov_b32_e32 v0, s8
2301; VI-NEXT:    s_add_u32 s8, s2, 16
2302; VI-NEXT:    v_mov_b32_e32 v1, s9
2303; VI-NEXT:    s_addc_u32 s9, s3, 0
2304; VI-NEXT:    v_mov_b32_e32 v4, s8
2305; VI-NEXT:    v_mov_b32_e32 v2, s10
2306; VI-NEXT:    v_mov_b32_e32 v3, s11
2307; VI-NEXT:    v_mov_b32_e32 v5, s9
2308; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2309; VI-NEXT:    v_mov_b32_e32 v5, s3
2310; VI-NEXT:    v_mov_b32_e32 v0, s4
2311; VI-NEXT:    v_mov_b32_e32 v1, s5
2312; VI-NEXT:    v_mov_b32_e32 v2, s6
2313; VI-NEXT:    v_mov_b32_e32 v3, s7
2314; VI-NEXT:    v_mov_b32_e32 v4, s2
2315; VI-NEXT:    s_add_u32 s2, s2, 32
2316; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2317; VI-NEXT:    s_addc_u32 s3, s3, 0
2318; VI-NEXT:    v_mov_b32_e32 v2, s2
2319; VI-NEXT:    v_mov_b32_e32 v0, s0
2320; VI-NEXT:    v_mov_b32_e32 v1, s1
2321; VI-NEXT:    v_mov_b32_e32 v3, s3
2322; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2323; VI-NEXT:    s_endpgm
2324;
2325; GFX9-LABEL: v5i64_arg:
2326; GFX9:       ; %bb.0: ; %entry
2327; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
2328; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2329; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
2330; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2331; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2332; GFX9-NEXT:    v_mov_b32_e32 v0, s12
2333; GFX9-NEXT:    v_mov_b32_e32 v1, s13
2334; GFX9-NEXT:    v_mov_b32_e32 v2, s14
2335; GFX9-NEXT:    v_mov_b32_e32 v3, s15
2336; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
2337; GFX9-NEXT:    s_nop 0
2338; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2339; GFX9-NEXT:    v_mov_b32_e32 v1, s9
2340; GFX9-NEXT:    v_mov_b32_e32 v2, s10
2341; GFX9-NEXT:    v_mov_b32_e32 v3, s11
2342; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2343; GFX9-NEXT:    s_nop 0
2344; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2345; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2346; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
2347; GFX9-NEXT:    s_endpgm
2348;
2349; EG-LABEL: v5i64_arg:
2350; EG:       ; %bb.0: ; %entry
2351; EG-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2352; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2353; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2354; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2355; EG-NEXT:    CF_END
2356; EG-NEXT:    PAD
2357; EG-NEXT:    ALU clause starting at 6:
2358; EG-NEXT:     MOV * T0.W, KC0[7].X,
2359; EG-NEXT:     MOV * T0.Z, KC0[6].W,
2360; EG-NEXT:     MOV T0.Y, KC0[6].Z,
2361; EG-NEXT:     MOV * T1.W, KC0[8].X,
2362; EG-NEXT:     MOV T0.X, KC0[6].Y,
2363; EG-NEXT:     MOV * T1.Z, KC0[7].W,
2364; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2365; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
2366; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2367; EG-NEXT:     MOV T1.X, KC0[7].Y,
2368; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2369; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2370; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
2371; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
2372; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2373; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
2374; EG-NEXT:     MOV T5.Y, KC0[8].Z,
2375; EG-NEXT:     MOV * T5.X, KC0[8].Y,
2376; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2377;
2378; CM-LABEL: v5i64_arg:
2379; CM:       ; %bb.0: ; %entry
2380; CM-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2381; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2382; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2383; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2384; CM-NEXT:    CF_END
2385; CM-NEXT:    PAD
2386; CM-NEXT:    ALU clause starting at 6:
2387; CM-NEXT:     MOV * T0.W, KC0[8].X,
2388; CM-NEXT:     MOV T1.Y, KC0[8].Z,
2389; CM-NEXT:     MOV * T0.Z, KC0[7].W,
2390; CM-NEXT:     MOV T1.X, KC0[8].Y,
2391; CM-NEXT:     MOV * T0.Y, KC0[7].Z,
2392; CM-NEXT:     MOV T0.X, KC0[7].Y,
2393; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
2394; CM-NEXT:     MOV * T2.W, KC0[7].X,
2395; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
2396; CM-NEXT:     LSHR T3.X, PV.Z, literal.x,
2397; CM-NEXT:     MOV T2.Z, KC0[6].W,
2398; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
2399; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2400; CM-NEXT:     LSHR T4.X, PV.W, literal.x,
2401; CM-NEXT:     MOV * T2.Y, KC0[6].Z,
2402; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2403; CM-NEXT:     MOV * T2.X, KC0[6].Y,
2404; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
2405; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2406entry:
2407  store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
2408  ret void
2409}
2410
; Kernel that stores its <5 x double> kernel argument %in to the addrspace(1)
; pointer %out with 8-byte alignment.
; Per the autogenerated checks below, SI, VI and GFX9 fetch the first four
; elements with one s_load_dwordx8 plus a separate scalar load for the fifth
; element, then emit two dwordx4 stores (offsets 16 and 0) and one dwordx2
; store at offset 32. EG and CM copy the elements out of KC0 with MOV and
; issue three MEM_RAT_CACHELESS stores.
2411define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
2412; SI-LABEL: v5f64_arg:
2413; SI:       ; %bb.0: ; %entry
2414; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
2415; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
2416; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x21
2417; SI-NEXT:    s_mov_b32 s15, 0xf000
2418; SI-NEXT:    s_mov_b32 s14, -1
2419; SI-NEXT:    s_waitcnt lgkmcnt(0)
2420; SI-NEXT:    v_mov_b32_e32 v0, s8
2421; SI-NEXT:    v_mov_b32_e32 v1, s9
2422; SI-NEXT:    v_mov_b32_e32 v2, s10
2423; SI-NEXT:    v_mov_b32_e32 v3, s11
2424; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
2425; SI-NEXT:    s_waitcnt expcnt(0)
2426; SI-NEXT:    v_mov_b32_e32 v0, s4
2427; SI-NEXT:    v_mov_b32_e32 v1, s5
2428; SI-NEXT:    v_mov_b32_e32 v2, s6
2429; SI-NEXT:    v_mov_b32_e32 v3, s7
2430; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
2431; SI-NEXT:    s_waitcnt expcnt(0)
2432; SI-NEXT:    v_mov_b32_e32 v0, s0
2433; SI-NEXT:    v_mov_b32_e32 v1, s1
2434; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
2435; SI-NEXT:    s_endpgm
2436;
2437; VI-LABEL: v5f64_arg:
2438; VI:       ; %bb.0: ; %entry
2439; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
2440; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2441; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
2442; VI-NEXT:    s_waitcnt lgkmcnt(0)
2443; VI-NEXT:    v_mov_b32_e32 v0, s8
2444; VI-NEXT:    s_add_u32 s8, s2, 16
2445; VI-NEXT:    v_mov_b32_e32 v1, s9
2446; VI-NEXT:    s_addc_u32 s9, s3, 0
2447; VI-NEXT:    v_mov_b32_e32 v4, s8
2448; VI-NEXT:    v_mov_b32_e32 v2, s10
2449; VI-NEXT:    v_mov_b32_e32 v3, s11
2450; VI-NEXT:    v_mov_b32_e32 v5, s9
2451; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2452; VI-NEXT:    v_mov_b32_e32 v5, s3
2453; VI-NEXT:    v_mov_b32_e32 v0, s4
2454; VI-NEXT:    v_mov_b32_e32 v1, s5
2455; VI-NEXT:    v_mov_b32_e32 v2, s6
2456; VI-NEXT:    v_mov_b32_e32 v3, s7
2457; VI-NEXT:    v_mov_b32_e32 v4, s2
2458; VI-NEXT:    s_add_u32 s2, s2, 32
2459; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2460; VI-NEXT:    s_addc_u32 s3, s3, 0
2461; VI-NEXT:    v_mov_b32_e32 v2, s2
2462; VI-NEXT:    v_mov_b32_e32 v0, s0
2463; VI-NEXT:    v_mov_b32_e32 v1, s1
2464; VI-NEXT:    v_mov_b32_e32 v3, s3
2465; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2466; VI-NEXT:    s_endpgm
2467;
2468; GFX9-LABEL: v5f64_arg:
2469; GFX9:       ; %bb.0: ; %entry
2470; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
2471; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2472; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
2473; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2474; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2475; GFX9-NEXT:    v_mov_b32_e32 v0, s12
2476; GFX9-NEXT:    v_mov_b32_e32 v1, s13
2477; GFX9-NEXT:    v_mov_b32_e32 v2, s14
2478; GFX9-NEXT:    v_mov_b32_e32 v3, s15
2479; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
2480; GFX9-NEXT:    s_nop 0
2481; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2482; GFX9-NEXT:    v_mov_b32_e32 v1, s9
2483; GFX9-NEXT:    v_mov_b32_e32 v2, s10
2484; GFX9-NEXT:    v_mov_b32_e32 v3, s11
2485; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2486; GFX9-NEXT:    s_nop 0
2487; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2488; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2489; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
2490; GFX9-NEXT:    s_endpgm
2491;
2492; EG-LABEL: v5f64_arg:
2493; EG:       ; %bb.0: ; %entry
2494; EG-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2495; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2496; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2497; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2498; EG-NEXT:    CF_END
2499; EG-NEXT:    PAD
2500; EG-NEXT:    ALU clause starting at 6:
2501; EG-NEXT:     MOV * T0.W, KC0[7].X,
2502; EG-NEXT:     MOV * T0.Z, KC0[6].W,
2503; EG-NEXT:     MOV T0.Y, KC0[6].Z,
2504; EG-NEXT:     MOV * T1.W, KC0[8].X,
2505; EG-NEXT:     MOV T0.X, KC0[6].Y,
2506; EG-NEXT:     MOV * T1.Z, KC0[7].W,
2507; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2508; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
2509; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2510; EG-NEXT:     MOV T1.X, KC0[7].Y,
2511; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2512; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2513; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
2514; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
2515; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2516; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
2517; EG-NEXT:     MOV T5.Y, KC0[8].Z,
2518; EG-NEXT:     MOV * T5.X, KC0[8].Y,
2519; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2520;
2521; CM-LABEL: v5f64_arg:
2522; CM:       ; %bb.0: ; %entry
2523; CM-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2524; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2525; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2526; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2527; CM-NEXT:    CF_END
2528; CM-NEXT:    PAD
2529; CM-NEXT:    ALU clause starting at 6:
2530; CM-NEXT:     MOV * T0.W, KC0[8].X,
2531; CM-NEXT:     MOV T1.Y, KC0[8].Z,
2532; CM-NEXT:     MOV * T0.Z, KC0[7].W,
2533; CM-NEXT:     MOV T1.X, KC0[8].Y,
2534; CM-NEXT:     MOV * T0.Y, KC0[7].Z,
2535; CM-NEXT:     MOV T0.X, KC0[7].Y,
2536; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
2537; CM-NEXT:     MOV * T2.W, KC0[7].X,
2538; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
2539; CM-NEXT:     LSHR T3.X, PV.Z, literal.x,
2540; CM-NEXT:     MOV T2.Z, KC0[6].W,
2541; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
2542; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2543; CM-NEXT:     LSHR T4.X, PV.W, literal.x,
2544; CM-NEXT:     MOV * T2.Y, KC0[6].Z,
2545; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2546; CM-NEXT:     MOV * T2.X, KC0[6].Y,
2547; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
2548; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2549entry:
2550  store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
2551  ret void
2552}
2553
2554; FIXME: Lots of unpack and re-pack junk on EG and CM (the VI checks below load and store the argument directly)
; Kernel that stores its <8 x i8> kernel argument %in to the addrspace(1)
; pointer %out.
; Per the autogenerated checks below, SI, VI and GFX9 load the whole vector
; with one s_load_dwordx2 and write it back with a single dwordx2 store.
; The EG and CM checks instead read each byte with its own VTX_READ_8 and
; rebuild the two result dwords through AND_INT / LSHL / OR_INT sequences.
2555define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
2556; SI-LABEL: v8i8_arg:
2557; SI:       ; %bb.0: ; %entry
2558; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
2559; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2560; SI-NEXT:    s_mov_b32 s3, 0xf000
2561; SI-NEXT:    s_mov_b32 s2, -1
2562; SI-NEXT:    s_waitcnt lgkmcnt(0)
2563; SI-NEXT:    v_mov_b32_e32 v0, s4
2564; SI-NEXT:    v_mov_b32_e32 v1, s5
2565; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2566; SI-NEXT:    s_endpgm
2567;
2568; VI-LABEL: v8i8_arg:
2569; VI:       ; %bb.0: ; %entry
2570; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2571; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
2572; VI-NEXT:    s_waitcnt lgkmcnt(0)
2573; VI-NEXT:    v_mov_b32_e32 v0, s2
2574; VI-NEXT:    v_mov_b32_e32 v3, s1
2575; VI-NEXT:    v_mov_b32_e32 v1, s3
2576; VI-NEXT:    v_mov_b32_e32 v2, s0
2577; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2578; VI-NEXT:    s_endpgm
2579;
2580; GFX9-LABEL: v8i8_arg:
2581; GFX9:       ; %bb.0: ; %entry
2582; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
2583; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
2584; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2585; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2586; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2587; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2588; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
2589; GFX9-NEXT:    s_endpgm
2590;
2591; EG-LABEL: v8i8_arg:
2592; EG:       ; %bb.0: ; %entry
2593; EG-NEXT:    ALU 1, @36, KC0[], KC1[]
2594; EG-NEXT:    TEX 0 @20
2595; EG-NEXT:    ALU 5, @38, KC0[], KC1[]
2596; EG-NEXT:    TEX 0 @22
2597; EG-NEXT:    ALU 5, @44, KC0[], KC1[]
2598; EG-NEXT:    TEX 0 @24
2599; EG-NEXT:    ALU 7, @50, KC0[], KC1[]
2600; EG-NEXT:    TEX 0 @26
2601; EG-NEXT:    ALU 7, @58, KC0[], KC1[]
2602; EG-NEXT:    TEX 0 @28
2603; EG-NEXT:    ALU 7, @66, KC0[], KC1[]
2604; EG-NEXT:    TEX 0 @30
2605; EG-NEXT:    ALU 7, @74, KC0[], KC1[]
2606; EG-NEXT:    TEX 0 @32
2607; EG-NEXT:    ALU 5, @82, KC0[], KC1[]
2608; EG-NEXT:    TEX 0 @34
2609; EG-NEXT:    ALU 5, @88, KC0[CB0:0-32], KC1[]
2610; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
2611; EG-NEXT:    CF_END
2612; EG-NEXT:    PAD
2613; EG-NEXT:    Fetch clause starting at 20:
2614; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 51, #3
2615; EG-NEXT:    Fetch clause starting at 22:
2616; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 47, #3
2617; EG-NEXT:    Fetch clause starting at 24:
2618; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 50, #3
2619; EG-NEXT:    Fetch clause starting at 26:
2620; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 46, #3
2621; EG-NEXT:    Fetch clause starting at 28:
2622; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 49, #3
2623; EG-NEXT:    Fetch clause starting at 30:
2624; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 45, #3
2625; EG-NEXT:    Fetch clause starting at 32:
2626; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 48, #3
2627; EG-NEXT:    Fetch clause starting at 34:
2628; EG-NEXT:     VTX_READ_8 T5.X, T5.X, 44, #3
2629; EG-NEXT:    ALU clause starting at 36:
2630; EG-NEXT:     MOV * T0.Y, T2.X,
2631; EG-NEXT:     MOV * T5.X, 0.0,
2632; EG-NEXT:    ALU clause starting at 38:
2633; EG-NEXT:     LSHL T0.W, T6.X, literal.x,
2634; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2635; EG-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
2636; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
2637; EG-NEXT:     MOV T2.X, PV.W,
2638; EG-NEXT:     MOV * T0.Y, T3.X,
2639; EG-NEXT:    ALU clause starting at 44:
2640; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2641; EG-NEXT:     LSHL * T1.W, T6.X, literal.y,
2642; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
2643; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2644; EG-NEXT:     MOV T3.X, PV.W,
2645; EG-NEXT:     MOV * T0.Y, T2.X,
2646; EG-NEXT:    ALU clause starting at 50:
2647; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2648; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2649; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
2650; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2651; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2652; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2653; EG-NEXT:     MOV T2.X, PV.W,
2654; EG-NEXT:     MOV * T0.Y, T3.X,
2655; EG-NEXT:    ALU clause starting at 58:
2656; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2657; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2658; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
2659; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2660; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2661; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2662; EG-NEXT:     MOV T3.X, PV.W,
2663; EG-NEXT:     MOV * T0.Y, T2.X,
2664; EG-NEXT:    ALU clause starting at 66:
2665; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2666; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2667; EG-NEXT:    255(3.573311e-43), -65281(nan)
2668; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2669; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2670; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2671; EG-NEXT:     MOV T2.X, PV.W,
2672; EG-NEXT:     MOV * T0.Y, T3.X,
2673; EG-NEXT:    ALU clause starting at 74:
2674; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2675; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2676; EG-NEXT:    255(3.573311e-43), -65281(nan)
2677; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2678; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2679; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2680; EG-NEXT:     MOV T3.X, PV.W,
2681; EG-NEXT:     MOV * T0.Y, T2.X,
2682; EG-NEXT:    ALU clause starting at 82:
2683; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2684; EG-NEXT:     AND_INT * T1.W, T6.X, literal.y,
2685; EG-NEXT:    -256(nan), 255(3.573311e-43)
2686; EG-NEXT:     OR_INT * T5.Y, PV.W, PS,
2687; EG-NEXT:     MOV T2.X, PV.Y,
2688; EG-NEXT:     MOV * T0.Y, T3.X,
2689; EG-NEXT:    ALU clause starting at 88:
2690; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2691; EG-NEXT:     AND_INT * T1.W, T5.X, literal.y,
2692; EG-NEXT:    -256(nan), 255(3.573311e-43)
2693; EG-NEXT:     OR_INT T5.X, PV.W, PS,
2694; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
2695; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2696;
2697; CM-LABEL: v8i8_arg:
2698; CM:       ; %bb.0: ; %entry
2699; CM-NEXT:    ALU 1, @36, KC0[], KC1[]
2700; CM-NEXT:    TEX 0 @20
2701; CM-NEXT:    ALU 5, @38, KC0[], KC1[]
2702; CM-NEXT:    TEX 0 @22
2703; CM-NEXT:    ALU 5, @44, KC0[], KC1[]
2704; CM-NEXT:    TEX 0 @24
2705; CM-NEXT:    ALU 7, @50, KC0[], KC1[]
2706; CM-NEXT:    TEX 0 @26
2707; CM-NEXT:    ALU 7, @58, KC0[], KC1[]
2708; CM-NEXT:    TEX 0 @28
2709; CM-NEXT:    ALU 7, @66, KC0[], KC1[]
2710; CM-NEXT:    TEX 0 @30
2711; CM-NEXT:    ALU 7, @74, KC0[], KC1[]
2712; CM-NEXT:    TEX 0 @32
2713; CM-NEXT:    ALU 5, @82, KC0[], KC1[]
2714; CM-NEXT:    TEX 0 @34
2715; CM-NEXT:    ALU 5, @88, KC0[CB0:0-32], KC1[]
2716; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
2717; CM-NEXT:    CF_END
2718; CM-NEXT:    PAD
2719; CM-NEXT:    Fetch clause starting at 20:
2720; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 51, #3
2721; CM-NEXT:    Fetch clause starting at 22:
2722; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 47, #3
2723; CM-NEXT:    Fetch clause starting at 24:
2724; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 50, #3
2725; CM-NEXT:    Fetch clause starting at 26:
2726; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 46, #3
2727; CM-NEXT:    Fetch clause starting at 28:
2728; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 49, #3
2729; CM-NEXT:    Fetch clause starting at 30:
2730; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 45, #3
2731; CM-NEXT:    Fetch clause starting at 32:
2732; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 48, #3
2733; CM-NEXT:    Fetch clause starting at 34:
2734; CM-NEXT:     VTX_READ_8 T5.X, T5.X, 44, #3
2735; CM-NEXT:    ALU clause starting at 36:
2736; CM-NEXT:     MOV * T0.Y, T2.X,
2737; CM-NEXT:     MOV * T5.X, 0.0,
2738; CM-NEXT:    ALU clause starting at 38:
2739; CM-NEXT:     LSHL T0.Z, T6.X, literal.x,
2740; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
2741; CM-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
2742; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
2743; CM-NEXT:     MOV T2.X, PV.W,
2744; CM-NEXT:     MOV * T0.Y, T3.X,
2745; CM-NEXT:    ALU clause starting at 44:
2746; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2747; CM-NEXT:     LSHL * T0.W, T6.X, literal.y,
2748; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
2749; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2750; CM-NEXT:     MOV T3.X, PV.W,
2751; CM-NEXT:     MOV * T0.Y, T2.X,
2752; CM-NEXT:    ALU clause starting at 50:
2753; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2754; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2755; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2756; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2757; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
2758; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2759; CM-NEXT:     MOV T2.X, PV.W,
2760; CM-NEXT:     MOV * T0.Y, T3.X,
2761; CM-NEXT:    ALU clause starting at 58:
2762; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2763; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2764; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2765; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2766; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
2767; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2768; CM-NEXT:     MOV T3.X, PV.W,
2769; CM-NEXT:     MOV * T0.Y, T2.X,
2770; CM-NEXT:    ALU clause starting at 66:
2771; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2772; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2773; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2774; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2775; CM-NEXT:    -65281(nan), 8(1.121039e-44)
2776; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2777; CM-NEXT:     MOV T2.X, PV.W,
2778; CM-NEXT:     MOV * T0.Y, T3.X,
2779; CM-NEXT:    ALU clause starting at 74:
2780; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2781; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2782; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2783; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2784; CM-NEXT:    -65281(nan), 8(1.121039e-44)
2785; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2786; CM-NEXT:     MOV T3.X, PV.W,
2787; CM-NEXT:     MOV * T0.Y, T2.X,
2788; CM-NEXT:    ALU clause starting at 82:
2789; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2790; CM-NEXT:     AND_INT * T0.W, T6.X, literal.y,
2791; CM-NEXT:    -256(nan), 255(3.573311e-43)
2792; CM-NEXT:     OR_INT * T5.Y, PV.Z, PV.W,
2793; CM-NEXT:     MOV T2.X, PV.Y,
2794; CM-NEXT:     MOV * T0.Y, T3.X,
2795; CM-NEXT:    ALU clause starting at 88:
2796; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2797; CM-NEXT:     AND_INT * T0.W, T5.X, literal.y,
2798; CM-NEXT:    -256(nan), 255(3.573311e-43)
2799; CM-NEXT:     OR_INT * T5.X, PV.Z, PV.W,
2800; CM-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
2801; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2802entry:
2803  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
2804  ret void
2805}
2806
; Kernel that stores its <8 x i16> kernel argument %in to the addrspace(1)
; pointer %out.
; Per the autogenerated checks below, SI, VI and GFX9 load the whole vector
; with one s_load_dwordx4 and write it back with a single dwordx4 store.
; The EG and CM checks instead read each 16-bit element with its own
; VTX_READ_16 and merge pairs into dwords with AND_INT / LSHL / OR_INT.
2807define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
2808; SI-LABEL: v8i16_arg:
2809; SI:       ; %bb.0: ; %entry
2810; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
2811; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2812; SI-NEXT:    s_mov_b32 s3, 0xf000
2813; SI-NEXT:    s_mov_b32 s2, -1
2814; SI-NEXT:    s_waitcnt lgkmcnt(0)
2815; SI-NEXT:    v_mov_b32_e32 v0, s4
2816; SI-NEXT:    v_mov_b32_e32 v1, s5
2817; SI-NEXT:    v_mov_b32_e32 v2, s6
2818; SI-NEXT:    v_mov_b32_e32 v3, s7
2819; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2820; SI-NEXT:    s_endpgm
2821;
2822; VI-LABEL: v8i16_arg:
2823; VI:       ; %bb.0: ; %entry
2824; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2825; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
2826; VI-NEXT:    s_waitcnt lgkmcnt(0)
2827; VI-NEXT:    v_mov_b32_e32 v4, s4
2828; VI-NEXT:    v_mov_b32_e32 v0, s0
2829; VI-NEXT:    v_mov_b32_e32 v5, s5
2830; VI-NEXT:    v_mov_b32_e32 v1, s1
2831; VI-NEXT:    v_mov_b32_e32 v2, s2
2832; VI-NEXT:    v_mov_b32_e32 v3, s3
2833; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2834; VI-NEXT:    s_endpgm
2835;
2836; GFX9-LABEL: v8i16_arg:
2837; GFX9:       ; %bb.0: ; %entry
2838; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
2839; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2840; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2841; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2842; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2843; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2844; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2845; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2846; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2847; GFX9-NEXT:    s_endpgm
2848;
2849; EG-LABEL: v8i16_arg:
2850; EG:       ; %bb.0: ; %entry
2851; EG-NEXT:    ALU 1, @36, KC0[], KC1[]
2852; EG-NEXT:    TEX 0 @20
2853; EG-NEXT:    ALU 5, @38, KC0[], KC1[]
2854; EG-NEXT:    TEX 0 @22
2855; EG-NEXT:    ALU 5, @44, KC0[], KC1[]
2856; EG-NEXT:    TEX 0 @24
2857; EG-NEXT:    ALU 5, @50, KC0[], KC1[]
2858; EG-NEXT:    TEX 0 @26
2859; EG-NEXT:    ALU 5, @56, KC0[], KC1[]
2860; EG-NEXT:    TEX 0 @28
2861; EG-NEXT:    ALU 5, @62, KC0[], KC1[]
2862; EG-NEXT:    TEX 0 @30
2863; EG-NEXT:    ALU 5, @68, KC0[], KC1[]
2864; EG-NEXT:    TEX 0 @32
2865; EG-NEXT:    ALU 5, @74, KC0[], KC1[]
2866; EG-NEXT:    TEX 0 @34
2867; EG-NEXT:    ALU 8, @80, KC0[CB0:0-32], KC1[]
2868; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
2869; EG-NEXT:    CF_END
2870; EG-NEXT:    PAD
2871; EG-NEXT:    Fetch clause starting at 20:
2872; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 66, #3
2873; EG-NEXT:    Fetch clause starting at 22:
2874; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 58, #3
2875; EG-NEXT:    Fetch clause starting at 24:
2876; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 64, #3
2877; EG-NEXT:    Fetch clause starting at 26:
2878; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 56, #3
2879; EG-NEXT:    Fetch clause starting at 28:
2880; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 62, #3
2881; EG-NEXT:    Fetch clause starting at 30:
2882; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 54, #3
2883; EG-NEXT:    Fetch clause starting at 32:
2884; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 60, #3
2885; EG-NEXT:    Fetch clause starting at 34:
2886; EG-NEXT:     VTX_READ_16 T7.X, T7.X, 52, #3
2887; EG-NEXT:    ALU clause starting at 36:
2888; EG-NEXT:     MOV * T0.Y, T3.X,
2889; EG-NEXT:     MOV * T7.X, 0.0,
2890; EG-NEXT:    ALU clause starting at 38:
2891; EG-NEXT:     LSHL T0.W, T8.X, literal.x,
2892; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2893; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
2894; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
2895; EG-NEXT:     MOV T3.X, PV.W,
2896; EG-NEXT:     MOV * T0.Y, T5.X,
2897; EG-NEXT:    ALU clause starting at 44:
2898; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2899; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
2900; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2901; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2902; EG-NEXT:     MOV T5.X, PV.W,
2903; EG-NEXT:     MOV * T0.Y, T3.X,
2904; EG-NEXT:    ALU clause starting at 50:
2905; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2906; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
2907; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
2908; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2909; EG-NEXT:     MOV T3.X, PV.W,
2910; EG-NEXT:     MOV * T0.Y, T5.X,
2911; EG-NEXT:    ALU clause starting at 56:
2912; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2913; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
2914; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
2915; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2916; EG-NEXT:     MOV T5.X, PV.W,
2917; EG-NEXT:     MOV * T0.Y, T2.X,
2918; EG-NEXT:    ALU clause starting at 62:
2919; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2920; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
2921; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2922; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2923; EG-NEXT:     MOV T2.X, PV.W,
2924; EG-NEXT:     MOV * T0.Y, T4.X,
2925; EG-NEXT:    ALU clause starting at 68:
2926; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2927; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
2928; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2929; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2930; EG-NEXT:     MOV T4.X, PV.W,
2931; EG-NEXT:     MOV * T0.Y, T2.X,
2932; EG-NEXT:    ALU clause starting at 74:
2933; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2934; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
2935; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
2936; EG-NEXT:     OR_INT * T7.Z, PV.W, PS,
2937; EG-NEXT:     MOV T2.X, PV.Z,
2938; EG-NEXT:     MOV * T0.Y, T4.X,
2939; EG-NEXT:    ALU clause starting at 80:
2940; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
2941; EG-NEXT:     AND_INT T0.W, T0.Y, literal.y,
2942; EG-NEXT:     AND_INT * T1.W, T7.X, literal.z,
2943; EG-NEXT:    2(2.802597e-45), -65536(nan)
2944; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2945; EG-NEXT:     OR_INT * T7.X, PV.W, PS,
2946; EG-NEXT:     MOV T4.X, PV.X,
2947; EG-NEXT:     MOV * T7.W, T3.X,
2948; EG-NEXT:     MOV * T7.Y, T5.X,
2949;
2950; CM-LABEL: v8i16_arg:
2951; CM:       ; %bb.0: ; %entry
2952; CM-NEXT:    ALU 1, @36, KC0[], KC1[]
2953; CM-NEXT:    TEX 0 @20
2954; CM-NEXT:    ALU 5, @38, KC0[], KC1[]
2955; CM-NEXT:    TEX 0 @22
2956; CM-NEXT:    ALU 5, @44, KC0[], KC1[]
2957; CM-NEXT:    TEX 0 @24
2958; CM-NEXT:    ALU 5, @50, KC0[], KC1[]
2959; CM-NEXT:    TEX 0 @26
2960; CM-NEXT:    ALU 5, @56, KC0[], KC1[]
2961; CM-NEXT:    TEX 0 @28
2962; CM-NEXT:    ALU 5, @62, KC0[], KC1[]
2963; CM-NEXT:    TEX 0 @30
2964; CM-NEXT:    ALU 5, @68, KC0[], KC1[]
2965; CM-NEXT:    TEX 0 @32
2966; CM-NEXT:    ALU 5, @74, KC0[], KC1[]
2967; CM-NEXT:    TEX 0 @34
2968; CM-NEXT:    ALU 8, @80, KC0[CB0:0-32], KC1[]
2969; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
2970; CM-NEXT:    CF_END
2971; CM-NEXT:    PAD
2972; CM-NEXT:    Fetch clause starting at 20:
2973; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 66, #3
2974; CM-NEXT:    Fetch clause starting at 22:
2975; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 58, #3
2976; CM-NEXT:    Fetch clause starting at 24:
2977; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 64, #3
2978; CM-NEXT:    Fetch clause starting at 26:
2979; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 56, #3
2980; CM-NEXT:    Fetch clause starting at 28:
2981; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 62, #3
2982; CM-NEXT:    Fetch clause starting at 30:
2983; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 54, #3
2984; CM-NEXT:    Fetch clause starting at 32:
2985; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 60, #3
2986; CM-NEXT:    Fetch clause starting at 34:
2987; CM-NEXT:     VTX_READ_16 T7.X, T7.X, 52, #3
2988; CM-NEXT:    ALU clause starting at 36:
2989; CM-NEXT:     MOV * T0.Y, T3.X,
2990; CM-NEXT:     MOV * T7.X, 0.0,
2991; CM-NEXT:    ALU clause starting at 38:
2992; CM-NEXT:     LSHL T0.Z, T8.X, literal.x,
2993; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
2994; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
2995; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
2996; CM-NEXT:     MOV T3.X, PV.W,
2997; CM-NEXT:     MOV * T0.Y, T5.X,
2998; CM-NEXT:    ALU clause starting at 44:
2999; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3000; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3001; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3002; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3003; CM-NEXT:     MOV T5.X, PV.W,
3004; CM-NEXT:     MOV * T0.Y, T3.X,
3005; CM-NEXT:    ALU clause starting at 50:
3006; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3007; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3008; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3009; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3010; CM-NEXT:     MOV T3.X, PV.W,
3011; CM-NEXT:     MOV * T0.Y, T5.X,
3012; CM-NEXT:    ALU clause starting at 56:
3013; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3014; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3015; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3016; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3017; CM-NEXT:     MOV T5.X, PV.W,
3018; CM-NEXT:     MOV * T0.Y, T2.X,
3019; CM-NEXT:    ALU clause starting at 62:
3020; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3021; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3022; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3023; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3024; CM-NEXT:     MOV T2.X, PV.W,
3025; CM-NEXT:     MOV * T0.Y, T4.X,
3026; CM-NEXT:    ALU clause starting at 68:
3027; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3028; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3029; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3030; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3031; CM-NEXT:     MOV T4.X, PV.W,
3032; CM-NEXT:     MOV * T0.Y, T2.X,
3033; CM-NEXT:    ALU clause starting at 74:
3034; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3035; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3036; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3037; CM-NEXT:     OR_INT * T7.Z, PV.Z, PV.W,
3038; CM-NEXT:     MOV T2.X, PV.Z,
3039; CM-NEXT:     MOV * T0.Y, T4.X,
3040; CM-NEXT:    ALU clause starting at 80:
3041; CM-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
3042; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
3043; CM-NEXT:     AND_INT * T0.W, T7.X, literal.z,
3044; CM-NEXT:    2(2.802597e-45), -65536(nan)
3045; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3046; CM-NEXT:     OR_INT * T7.X, PV.Z, PV.W,
3047; CM-NEXT:     MOV T4.X, PV.X,
3048; CM-NEXT:     MOV * T7.W, T3.X,
3049; CM-NEXT:     MOV * T7.Y, T5.X,
3050entry:
3051  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
3052  ret void
3053}
3054
; Kernel that stores its <8 x i32> kernel argument %in to the addrspace(1)
; pointer %out with 4-byte alignment.
; Per the autogenerated checks below, SI, VI and GFX9 load all eight dwords
; with one s_load_dwordx8 and write them back as two dwordx4 stores (the
; upper half at offset 16 first, then the lower half at offset 0). EG and CM
; copy the elements out of KC0 with MOV and issue two MEM_RAT_CACHELESS
; stores.
3055define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
3056; SI-LABEL: v8i32_arg:
3057; SI:       ; %bb.0: ; %entry
3058; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
3059; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3060; SI-NEXT:    s_mov_b32 s3, 0xf000
3061; SI-NEXT:    s_mov_b32 s2, -1
3062; SI-NEXT:    s_waitcnt lgkmcnt(0)
3063; SI-NEXT:    v_mov_b32_e32 v0, s8
3064; SI-NEXT:    v_mov_b32_e32 v1, s9
3065; SI-NEXT:    v_mov_b32_e32 v2, s10
3066; SI-NEXT:    v_mov_b32_e32 v3, s11
3067; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3068; SI-NEXT:    s_waitcnt expcnt(0)
3069; SI-NEXT:    v_mov_b32_e32 v0, s4
3070; SI-NEXT:    v_mov_b32_e32 v1, s5
3071; SI-NEXT:    v_mov_b32_e32 v2, s6
3072; SI-NEXT:    v_mov_b32_e32 v3, s7
3073; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3074; SI-NEXT:    s_endpgm
3075;
3076; VI-LABEL: v8i32_arg:
3077; VI:       ; %bb.0: ; %entry
3078; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
3079; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3080; VI-NEXT:    s_waitcnt lgkmcnt(0)
3081; VI-NEXT:    v_mov_b32_e32 v0, s8
3082; VI-NEXT:    s_add_u32 s2, s0, 16
3083; VI-NEXT:    s_addc_u32 s3, s1, 0
3084; VI-NEXT:    v_mov_b32_e32 v5, s3
3085; VI-NEXT:    v_mov_b32_e32 v1, s9
3086; VI-NEXT:    v_mov_b32_e32 v2, s10
3087; VI-NEXT:    v_mov_b32_e32 v3, s11
3088; VI-NEXT:    v_mov_b32_e32 v4, s2
3089; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3090; VI-NEXT:    v_mov_b32_e32 v5, s1
3091; VI-NEXT:    v_mov_b32_e32 v0, s4
3092; VI-NEXT:    v_mov_b32_e32 v1, s5
3093; VI-NEXT:    v_mov_b32_e32 v2, s6
3094; VI-NEXT:    v_mov_b32_e32 v3, s7
3095; VI-NEXT:    v_mov_b32_e32 v4, s0
3096; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3097; VI-NEXT:    s_endpgm
3098;
3099; GFX9-LABEL: v8i32_arg:
3100; GFX9:       ; %bb.0: ; %entry
3101; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
3102; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3103; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3104; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3105; GFX9-NEXT:    v_mov_b32_e32 v0, s12
3106; GFX9-NEXT:    v_mov_b32_e32 v1, s13
3107; GFX9-NEXT:    v_mov_b32_e32 v2, s14
3108; GFX9-NEXT:    v_mov_b32_e32 v3, s15
3109; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
3110; GFX9-NEXT:    s_nop 0
3111; GFX9-NEXT:    v_mov_b32_e32 v0, s8
3112; GFX9-NEXT:    v_mov_b32_e32 v1, s9
3113; GFX9-NEXT:    v_mov_b32_e32 v2, s10
3114; GFX9-NEXT:    v_mov_b32_e32 v3, s11
3115; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
3116; GFX9-NEXT:    s_endpgm
3117;
3118; EG-LABEL: v8i32_arg:
3119; EG:       ; %bb.0: ; %entry
3120; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3121; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
3122; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
3123; EG-NEXT:    CF_END
3124; EG-NEXT:    ALU clause starting at 4:
3125; EG-NEXT:     MOV * T0.W, KC0[5].X,
3126; EG-NEXT:     MOV * T0.Z, KC0[4].W,
3127; EG-NEXT:     MOV T0.Y, KC0[4].Z,
3128; EG-NEXT:     MOV * T1.W, KC0[6].X,
3129; EG-NEXT:     MOV T0.X, KC0[4].Y,
3130; EG-NEXT:     MOV * T1.Z, KC0[5].W,
3131; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
3132; EG-NEXT:     MOV * T1.Y, KC0[5].Z,
3133; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3134; EG-NEXT:     MOV T1.X, KC0[5].Y,
3135; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3136; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3137; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
3138; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3139;
3140; CM-LABEL: v8i32_arg:
3141; CM:       ; %bb.0: ; %entry
3142; CM-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3143; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
3144; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
3145; CM-NEXT:    CF_END
3146; CM-NEXT:    ALU clause starting at 4:
3147; CM-NEXT:     MOV * T0.W, KC0[6].X,
3148; CM-NEXT:     MOV * T0.Z, KC0[5].W,
3149; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
3150; CM-NEXT:     MOV T0.X, KC0[5].Y,
3151; CM-NEXT:     MOV * T1.W, KC0[5].X,
3152; CM-NEXT:     MOV T1.Z, KC0[4].W,
3153; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3154; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3155; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
3156; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
3157; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3158; CM-NEXT:     MOV * T1.X, KC0[4].Y,
3159; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
3160; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3161entry:
3162  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
3163  ret void
3164}
3165
; Kernel-argument lowering test: a by-value <8 x float> that follows a global
; pointer argument is stored through %out with align 4.
; The SI, VI and GFX9 checks expect one s_load_dwordx8 for the vector and two
; dwordx4 stores (the upper four elements go to byte offset 16 from %out).
; The EG and CM checks expect the elements to be moved out of KC0 and written
; with two MEM_RAT_CACHELESS stores.
; The check lines in this function are autogenerated (see the note on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
3166define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
3167; SI-LABEL: v8f32_arg:
3168; SI:       ; %bb.0: ; %entry
3169; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
3170; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3171; SI-NEXT:    s_mov_b32 s3, 0xf000
3172; SI-NEXT:    s_mov_b32 s2, -1
3173; SI-NEXT:    s_waitcnt lgkmcnt(0)
3174; SI-NEXT:    v_mov_b32_e32 v0, s8
3175; SI-NEXT:    v_mov_b32_e32 v1, s9
3176; SI-NEXT:    v_mov_b32_e32 v2, s10
3177; SI-NEXT:    v_mov_b32_e32 v3, s11
3178; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3179; SI-NEXT:    s_waitcnt expcnt(0)
3180; SI-NEXT:    v_mov_b32_e32 v0, s4
3181; SI-NEXT:    v_mov_b32_e32 v1, s5
3182; SI-NEXT:    v_mov_b32_e32 v2, s6
3183; SI-NEXT:    v_mov_b32_e32 v3, s7
3184; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3185; SI-NEXT:    s_endpgm
3186;
3187; VI-LABEL: v8f32_arg:
3188; VI:       ; %bb.0: ; %entry
3189; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
3190; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3191; VI-NEXT:    s_waitcnt lgkmcnt(0)
3192; VI-NEXT:    v_mov_b32_e32 v0, s8
3193; VI-NEXT:    s_add_u32 s2, s0, 16
3194; VI-NEXT:    s_addc_u32 s3, s1, 0
3195; VI-NEXT:    v_mov_b32_e32 v5, s3
3196; VI-NEXT:    v_mov_b32_e32 v1, s9
3197; VI-NEXT:    v_mov_b32_e32 v2, s10
3198; VI-NEXT:    v_mov_b32_e32 v3, s11
3199; VI-NEXT:    v_mov_b32_e32 v4, s2
3200; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3201; VI-NEXT:    v_mov_b32_e32 v5, s1
3202; VI-NEXT:    v_mov_b32_e32 v0, s4
3203; VI-NEXT:    v_mov_b32_e32 v1, s5
3204; VI-NEXT:    v_mov_b32_e32 v2, s6
3205; VI-NEXT:    v_mov_b32_e32 v3, s7
3206; VI-NEXT:    v_mov_b32_e32 v4, s0
3207; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3208; VI-NEXT:    s_endpgm
3209;
3210; GFX9-LABEL: v8f32_arg:
3211; GFX9:       ; %bb.0: ; %entry
3212; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
3213; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3214; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3215; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3216; GFX9-NEXT:    v_mov_b32_e32 v0, s12
3217; GFX9-NEXT:    v_mov_b32_e32 v1, s13
3218; GFX9-NEXT:    v_mov_b32_e32 v2, s14
3219; GFX9-NEXT:    v_mov_b32_e32 v3, s15
3220; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
3221; GFX9-NEXT:    s_nop 0
3222; GFX9-NEXT:    v_mov_b32_e32 v0, s8
3223; GFX9-NEXT:    v_mov_b32_e32 v1, s9
3224; GFX9-NEXT:    v_mov_b32_e32 v2, s10
3225; GFX9-NEXT:    v_mov_b32_e32 v3, s11
3226; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
3227; GFX9-NEXT:    s_endpgm
3228;
3229; EG-LABEL: v8f32_arg:
3230; EG:       ; %bb.0: ; %entry
3231; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3232; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
3233; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
3234; EG-NEXT:    CF_END
3235; EG-NEXT:    ALU clause starting at 4:
3236; EG-NEXT:     MOV * T0.W, KC0[5].X,
3237; EG-NEXT:     MOV * T0.Z, KC0[4].W,
3238; EG-NEXT:     MOV T0.Y, KC0[4].Z,
3239; EG-NEXT:     MOV * T1.W, KC0[6].X,
3240; EG-NEXT:     MOV T0.X, KC0[4].Y,
3241; EG-NEXT:     MOV * T1.Z, KC0[5].W,
3242; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
3243; EG-NEXT:     MOV * T1.Y, KC0[5].Z,
3244; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3245; EG-NEXT:     MOV T1.X, KC0[5].Y,
3246; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3247; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3248; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
3249; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3250;
3251; CM-LABEL: v8f32_arg:
3252; CM:       ; %bb.0: ; %entry
3253; CM-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3254; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
3255; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
3256; CM-NEXT:    CF_END
3257; CM-NEXT:    ALU clause starting at 4:
3258; CM-NEXT:     MOV * T0.W, KC0[6].X,
3259; CM-NEXT:     MOV * T0.Z, KC0[5].W,
3260; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
3261; CM-NEXT:     MOV T0.X, KC0[5].Y,
3262; CM-NEXT:     MOV * T1.W, KC0[5].X,
3263; CM-NEXT:     MOV T1.Z, KC0[4].W,
3264; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3265; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3266; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
3267; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
3268; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3269; CM-NEXT:     MOV * T1.X, KC0[4].Y,
3270; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
3271; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3272entry:
3273  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
3274  ret void
3275}
3276
3277; FIXME: Pack/repack on VI
; NOTE(review): the VI checks below show a single s_load_dwordx4 feeding a
; single flat_store_dwordx4 with no byte repacking, so the FIXME above may be
; stale - confirm before removing it.
;
; Kernel-argument lowering test: a by-value <16 x i8> that follows a global
; pointer argument is stored through %out (no explicit alignment on the store).
; The SI, VI and GFX9 checks expect the whole vector to be fetched with one
; s_load_dwordx4 and written with one dwordx4 store.
; The EG and CM checks expect sixteen separate VTX_READ_8 fetches; each byte is
; merged into its destination dword with AND_INT/LSHL/OR_INT before a single
; MEM_RAT_CACHELESS store.
; The check lines in this function are autogenerated (see the note on the first
; line of the file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
3278define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
3279; SI-LABEL: v16i8_arg:
3280; SI:       ; %bb.0: ; %entry
3281; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
3282; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3283; SI-NEXT:    s_mov_b32 s3, 0xf000
3284; SI-NEXT:    s_mov_b32 s2, -1
3285; SI-NEXT:    s_waitcnt lgkmcnt(0)
3286; SI-NEXT:    v_mov_b32_e32 v0, s4
3287; SI-NEXT:    v_mov_b32_e32 v1, s5
3288; SI-NEXT:    v_mov_b32_e32 v2, s6
3289; SI-NEXT:    v_mov_b32_e32 v3, s7
3290; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3291; SI-NEXT:    s_endpgm
3292;
3293; VI-LABEL: v16i8_arg:
3294; VI:       ; %bb.0: ; %entry
3295; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
3296; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
3297; VI-NEXT:    s_waitcnt lgkmcnt(0)
3298; VI-NEXT:    v_mov_b32_e32 v4, s4
3299; VI-NEXT:    v_mov_b32_e32 v0, s0
3300; VI-NEXT:    v_mov_b32_e32 v5, s5
3301; VI-NEXT:    v_mov_b32_e32 v1, s1
3302; VI-NEXT:    v_mov_b32_e32 v2, s2
3303; VI-NEXT:    v_mov_b32_e32 v3, s3
3304; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3305; VI-NEXT:    s_endpgm
3306;
3307; GFX9-LABEL: v16i8_arg:
3308; GFX9:       ; %bb.0: ; %entry
3309; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
3310; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
3311; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3312; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3313; GFX9-NEXT:    v_mov_b32_e32 v0, s0
3314; GFX9-NEXT:    v_mov_b32_e32 v1, s1
3315; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3316; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3317; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
3318; GFX9-NEXT:    s_endpgm
3319;
3320; EG-LABEL: v16i8_arg:
3321; EG:       ; %bb.0: ; %entry
3322; EG-NEXT:    ALU 1, @68, KC0[], KC1[]
3323; EG-NEXT:    TEX 0 @36
3324; EG-NEXT:    ALU 5, @70, KC0[], KC1[]
3325; EG-NEXT:    TEX 0 @38
3326; EG-NEXT:    ALU 5, @76, KC0[], KC1[]
3327; EG-NEXT:    TEX 0 @40
3328; EG-NEXT:    ALU 5, @82, KC0[], KC1[]
3329; EG-NEXT:    TEX 0 @42
3330; EG-NEXT:    ALU 5, @88, KC0[], KC1[]
3331; EG-NEXT:    TEX 0 @44
3332; EG-NEXT:    ALU 7, @94, KC0[], KC1[]
3333; EG-NEXT:    TEX 0 @46
3334; EG-NEXT:    ALU 7, @102, KC0[], KC1[]
3335; EG-NEXT:    TEX 0 @48
3336; EG-NEXT:    ALU 7, @110, KC0[], KC1[]
3337; EG-NEXT:    TEX 0 @50
3338; EG-NEXT:    ALU 7, @118, KC0[], KC1[]
3339; EG-NEXT:    TEX 0 @52
3340; EG-NEXT:    ALU 7, @126, KC0[], KC1[]
3341; EG-NEXT:    TEX 0 @54
3342; EG-NEXT:    ALU 7, @134, KC0[], KC1[]
3343; EG-NEXT:    TEX 0 @56
3344; EG-NEXT:    ALU 7, @142, KC0[], KC1[]
3345; EG-NEXT:    TEX 0 @58
3346; EG-NEXT:    ALU 7, @150, KC0[], KC1[]
3347; EG-NEXT:    TEX 0 @60
3348; EG-NEXT:    ALU 5, @158, KC0[], KC1[]
3349; EG-NEXT:    TEX 0 @62
3350; EG-NEXT:    ALU 5, @164, KC0[], KC1[]
3351; EG-NEXT:    TEX 0 @64
3352; EG-NEXT:    ALU 5, @170, KC0[], KC1[]
3353; EG-NEXT:    TEX 0 @66
3354; EG-NEXT:    ALU 5, @176, KC0[CB0:0-32], KC1[]
3355; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
3356; EG-NEXT:    CF_END
3357; EG-NEXT:    PAD
3358; EG-NEXT:    Fetch clause starting at 36:
3359; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 67, #3
3360; EG-NEXT:    Fetch clause starting at 38:
3361; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 63, #3
3362; EG-NEXT:    Fetch clause starting at 40:
3363; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 59, #3
3364; EG-NEXT:    Fetch clause starting at 42:
3365; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 55, #3
3366; EG-NEXT:    Fetch clause starting at 44:
3367; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 66, #3
3368; EG-NEXT:    Fetch clause starting at 46:
3369; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 62, #3
3370; EG-NEXT:    Fetch clause starting at 48:
3371; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 58, #3
3372; EG-NEXT:    Fetch clause starting at 50:
3373; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 54, #3
3374; EG-NEXT:    Fetch clause starting at 52:
3375; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 65, #3
3376; EG-NEXT:    Fetch clause starting at 54:
3377; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 61, #3
3378; EG-NEXT:    Fetch clause starting at 56:
3379; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 57, #3
3380; EG-NEXT:    Fetch clause starting at 58:
3381; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 53, #3
3382; EG-NEXT:    Fetch clause starting at 60:
3383; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 64, #3
3384; EG-NEXT:    Fetch clause starting at 62:
3385; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 60, #3
3386; EG-NEXT:    Fetch clause starting at 64:
3387; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 56, #3
3388; EG-NEXT:    Fetch clause starting at 66:
3389; EG-NEXT:     VTX_READ_8 T7.X, T7.X, 52, #3
3390; EG-NEXT:    ALU clause starting at 68:
3391; EG-NEXT:     MOV * T0.Y, T2.X,
3392; EG-NEXT:     MOV * T7.X, 0.0,
3393; EG-NEXT:    ALU clause starting at 70:
3394; EG-NEXT:     LSHL T0.W, T8.X, literal.x,
3395; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3396; EG-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
3397; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
3398; EG-NEXT:     MOV T2.X, PV.W,
3399; EG-NEXT:     MOV * T0.Y, T3.X,
3400; EG-NEXT:    ALU clause starting at 76:
3401; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3402; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
3403; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3404; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3405; EG-NEXT:     MOV T3.X, PV.W,
3406; EG-NEXT:     MOV * T0.Y, T4.X,
3407; EG-NEXT:    ALU clause starting at 82:
3408; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3409; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
3410; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3411; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3412; EG-NEXT:     MOV T4.X, PV.W,
3413; EG-NEXT:     MOV * T0.Y, T5.X,
3414; EG-NEXT:    ALU clause starting at 88:
3415; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3416; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
3417; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3418; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3419; EG-NEXT:     MOV T5.X, PV.W,
3420; EG-NEXT:     MOV * T0.Y, T2.X,
3421; EG-NEXT:    ALU clause starting at 94:
3422; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3423; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3424; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3425; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3426; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3427; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3428; EG-NEXT:     MOV T2.X, PV.W,
3429; EG-NEXT:     MOV * T0.Y, T3.X,
3430; EG-NEXT:    ALU clause starting at 102:
3431; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3432; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3433; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3434; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3435; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3436; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3437; EG-NEXT:     MOV T3.X, PV.W,
3438; EG-NEXT:     MOV * T0.Y, T4.X,
3439; EG-NEXT:    ALU clause starting at 110:
3440; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3441; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3442; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3443; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3444; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3445; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3446; EG-NEXT:     MOV T4.X, PV.W,
3447; EG-NEXT:     MOV * T0.Y, T5.X,
3448; EG-NEXT:    ALU clause starting at 118:
3449; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3450; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3451; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3452; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3453; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3454; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3455; EG-NEXT:     MOV T5.X, PV.W,
3456; EG-NEXT:     MOV * T0.Y, T2.X,
3457; EG-NEXT:    ALU clause starting at 126:
3458; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3459; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3460; EG-NEXT:    255(3.573311e-43), -65281(nan)
3461; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3462; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3463; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3464; EG-NEXT:     MOV T2.X, PV.W,
3465; EG-NEXT:     MOV * T0.Y, T3.X,
3466; EG-NEXT:    ALU clause starting at 134:
3467; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3468; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3469; EG-NEXT:    255(3.573311e-43), -65281(nan)
3470; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3471; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3472; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3473; EG-NEXT:     MOV T3.X, PV.W,
3474; EG-NEXT:     MOV * T0.Y, T4.X,
3475; EG-NEXT:    ALU clause starting at 142:
3476; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3477; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3478; EG-NEXT:    255(3.573311e-43), -65281(nan)
3479; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3480; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3481; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3482; EG-NEXT:     MOV T4.X, PV.W,
3483; EG-NEXT:     MOV * T0.Y, T5.X,
3484; EG-NEXT:    ALU clause starting at 150:
3485; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3486; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3487; EG-NEXT:    255(3.573311e-43), -65281(nan)
3488; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3489; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3490; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3491; EG-NEXT:     MOV T5.X, PV.W,
3492; EG-NEXT:     MOV * T0.Y, T2.X,
3493; EG-NEXT:    ALU clause starting at 158:
3494; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3495; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
3496; EG-NEXT:    -256(nan), 255(3.573311e-43)
3497; EG-NEXT:     OR_INT * T7.W, PV.W, PS,
3498; EG-NEXT:     MOV T2.X, PV.W,
3499; EG-NEXT:     MOV * T0.Y, T3.X,
3500; EG-NEXT:    ALU clause starting at 164:
3501; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3502; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
3503; EG-NEXT:    -256(nan), 255(3.573311e-43)
3504; EG-NEXT:     OR_INT * T7.Z, PV.W, PS,
3505; EG-NEXT:     MOV T3.X, PV.Z,
3506; EG-NEXT:     MOV * T0.Y, T4.X,
3507; EG-NEXT:    ALU clause starting at 170:
3508; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3509; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
3510; EG-NEXT:    -256(nan), 255(3.573311e-43)
3511; EG-NEXT:     OR_INT * T7.Y, PV.W, PS,
3512; EG-NEXT:     MOV T4.X, PV.Y,
3513; EG-NEXT:     MOV * T0.Y, T5.X,
3514; EG-NEXT:    ALU clause starting at 176:
3515; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3516; EG-NEXT:     AND_INT * T1.W, T7.X, literal.y,
3517; EG-NEXT:    -256(nan), 255(3.573311e-43)
3518; EG-NEXT:     OR_INT T7.X, PV.W, PS,
3519; EG-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
3520; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3521;
3522; CM-LABEL: v16i8_arg:
3523; CM:       ; %bb.0: ; %entry
3524; CM-NEXT:    ALU 1, @68, KC0[], KC1[]
3525; CM-NEXT:    TEX 0 @36
3526; CM-NEXT:    ALU 5, @70, KC0[], KC1[]
3527; CM-NEXT:    TEX 0 @38
3528; CM-NEXT:    ALU 5, @76, KC0[], KC1[]
3529; CM-NEXT:    TEX 0 @40
3530; CM-NEXT:    ALU 5, @82, KC0[], KC1[]
3531; CM-NEXT:    TEX 0 @42
3532; CM-NEXT:    ALU 5, @88, KC0[], KC1[]
3533; CM-NEXT:    TEX 0 @44
3534; CM-NEXT:    ALU 7, @94, KC0[], KC1[]
3535; CM-NEXT:    TEX 0 @46
3536; CM-NEXT:    ALU 7, @102, KC0[], KC1[]
3537; CM-NEXT:    TEX 0 @48
3538; CM-NEXT:    ALU 7, @110, KC0[], KC1[]
3539; CM-NEXT:    TEX 0 @50
3540; CM-NEXT:    ALU 7, @118, KC0[], KC1[]
3541; CM-NEXT:    TEX 0 @52
3542; CM-NEXT:    ALU 7, @126, KC0[], KC1[]
3543; CM-NEXT:    TEX 0 @54
3544; CM-NEXT:    ALU 7, @134, KC0[], KC1[]
3545; CM-NEXT:    TEX 0 @56
3546; CM-NEXT:    ALU 7, @142, KC0[], KC1[]
3547; CM-NEXT:    TEX 0 @58
3548; CM-NEXT:    ALU 7, @150, KC0[], KC1[]
3549; CM-NEXT:    TEX 0 @60
3550; CM-NEXT:    ALU 5, @158, KC0[], KC1[]
3551; CM-NEXT:    TEX 0 @62
3552; CM-NEXT:    ALU 5, @164, KC0[], KC1[]
3553; CM-NEXT:    TEX 0 @64
3554; CM-NEXT:    ALU 5, @170, KC0[], KC1[]
3555; CM-NEXT:    TEX 0 @66
3556; CM-NEXT:    ALU 5, @176, KC0[CB0:0-32], KC1[]
3557; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
3558; CM-NEXT:    CF_END
3559; CM-NEXT:    PAD
3560; CM-NEXT:    Fetch clause starting at 36:
3561; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 67, #3
3562; CM-NEXT:    Fetch clause starting at 38:
3563; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 63, #3
3564; CM-NEXT:    Fetch clause starting at 40:
3565; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 59, #3
3566; CM-NEXT:    Fetch clause starting at 42:
3567; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 55, #3
3568; CM-NEXT:    Fetch clause starting at 44:
3569; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 66, #3
3570; CM-NEXT:    Fetch clause starting at 46:
3571; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 62, #3
3572; CM-NEXT:    Fetch clause starting at 48:
3573; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 58, #3
3574; CM-NEXT:    Fetch clause starting at 50:
3575; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 54, #3
3576; CM-NEXT:    Fetch clause starting at 52:
3577; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 65, #3
3578; CM-NEXT:    Fetch clause starting at 54:
3579; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 61, #3
3580; CM-NEXT:    Fetch clause starting at 56:
3581; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 57, #3
3582; CM-NEXT:    Fetch clause starting at 58:
3583; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 53, #3
3584; CM-NEXT:    Fetch clause starting at 60:
3585; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 64, #3
3586; CM-NEXT:    Fetch clause starting at 62:
3587; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 60, #3
3588; CM-NEXT:    Fetch clause starting at 64:
3589; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 56, #3
3590; CM-NEXT:    Fetch clause starting at 66:
3591; CM-NEXT:     VTX_READ_8 T7.X, T7.X, 52, #3
3592; CM-NEXT:    ALU clause starting at 68:
3593; CM-NEXT:     MOV * T0.Y, T2.X,
3594; CM-NEXT:     MOV * T7.X, 0.0,
3595; CM-NEXT:    ALU clause starting at 70:
3596; CM-NEXT:     LSHL T0.Z, T8.X, literal.x,
3597; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
3598; CM-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
3599; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
3600; CM-NEXT:     MOV T2.X, PV.W,
3601; CM-NEXT:     MOV * T0.Y, T3.X,
3602; CM-NEXT:    ALU clause starting at 76:
3603; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3604; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3605; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3606; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3607; CM-NEXT:     MOV T3.X, PV.W,
3608; CM-NEXT:     MOV * T0.Y, T4.X,
3609; CM-NEXT:    ALU clause starting at 82:
3610; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3611; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3612; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3613; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3614; CM-NEXT:     MOV T4.X, PV.W,
3615; CM-NEXT:     MOV * T0.Y, T5.X,
3616; CM-NEXT:    ALU clause starting at 88:
3617; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3618; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3619; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3620; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3621; CM-NEXT:     MOV T5.X, PV.W,
3622; CM-NEXT:     MOV * T0.Y, T2.X,
3623; CM-NEXT:    ALU clause starting at 94:
3624; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3625; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3626; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3627; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3628; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3629; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3630; CM-NEXT:     MOV T2.X, PV.W,
3631; CM-NEXT:     MOV * T0.Y, T3.X,
3632; CM-NEXT:    ALU clause starting at 102:
3633; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3634; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3635; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3636; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3637; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3638; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3639; CM-NEXT:     MOV T3.X, PV.W,
3640; CM-NEXT:     MOV * T0.Y, T4.X,
3641; CM-NEXT:    ALU clause starting at 110:
3642; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3643; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3644; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3645; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3646; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3647; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3648; CM-NEXT:     MOV T4.X, PV.W,
3649; CM-NEXT:     MOV * T0.Y, T5.X,
3650; CM-NEXT:    ALU clause starting at 118:
3651; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3652; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3653; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3654; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3655; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3656; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3657; CM-NEXT:     MOV T5.X, PV.W,
3658; CM-NEXT:     MOV * T0.Y, T2.X,
3659; CM-NEXT:    ALU clause starting at 126:
3660; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3661; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3662; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3663; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3664; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3665; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3666; CM-NEXT:     MOV T2.X, PV.W,
3667; CM-NEXT:     MOV * T0.Y, T3.X,
3668; CM-NEXT:    ALU clause starting at 134:
3669; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3670; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3671; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3672; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3673; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3674; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3675; CM-NEXT:     MOV T3.X, PV.W,
3676; CM-NEXT:     MOV * T0.Y, T4.X,
3677; CM-NEXT:    ALU clause starting at 142:
3678; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3679; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3680; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3681; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3682; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3683; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3684; CM-NEXT:     MOV T4.X, PV.W,
3685; CM-NEXT:     MOV * T0.Y, T5.X,
3686; CM-NEXT:    ALU clause starting at 150:
3687; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3688; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3689; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3690; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3691; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3692; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3693; CM-NEXT:     MOV T5.X, PV.W,
3694; CM-NEXT:     MOV * T0.Y, T2.X,
3695; CM-NEXT:    ALU clause starting at 158:
3696; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3697; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3698; CM-NEXT:    -256(nan), 255(3.573311e-43)
3699; CM-NEXT:     OR_INT * T7.W, PV.Z, PV.W,
3700; CM-NEXT:     MOV T2.X, PV.W,
3701; CM-NEXT:     MOV * T0.Y, T3.X,
3702; CM-NEXT:    ALU clause starting at 164:
3703; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3704; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3705; CM-NEXT:    -256(nan), 255(3.573311e-43)
3706; CM-NEXT:     OR_INT * T7.Z, PV.Z, PV.W,
3707; CM-NEXT:     MOV T3.X, PV.Z,
3708; CM-NEXT:     MOV * T0.Y, T4.X,
3709; CM-NEXT:    ALU clause starting at 170:
3710; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3711; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3712; CM-NEXT:    -256(nan), 255(3.573311e-43)
3713; CM-NEXT:     OR_INT * T7.Y, PV.Z, PV.W,
3714; CM-NEXT:     MOV T4.X, PV.Y,
3715; CM-NEXT:     MOV * T0.Y, T5.X,
3716; CM-NEXT:    ALU clause starting at 176:
3717; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3718; CM-NEXT:     AND_INT * T0.W, T7.X, literal.y,
3719; CM-NEXT:    -256(nan), 255(3.573311e-43)
3720; CM-NEXT:     OR_INT * T7.X, PV.Z, PV.W,
3721; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
3722; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3723entry:
3724  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
3725  ret void
3726}
3727
3728define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
3729; SI-LABEL: v16i16_arg:
3730; SI:       ; %bb.0: ; %entry
3731; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
3732; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3733; SI-NEXT:    s_mov_b32 s3, 0xf000
3734; SI-NEXT:    s_mov_b32 s2, -1
3735; SI-NEXT:    s_waitcnt lgkmcnt(0)
3736; SI-NEXT:    v_mov_b32_e32 v0, s8
3737; SI-NEXT:    v_mov_b32_e32 v1, s9
3738; SI-NEXT:    v_mov_b32_e32 v2, s10
3739; SI-NEXT:    v_mov_b32_e32 v3, s11
3740; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3741; SI-NEXT:    s_waitcnt expcnt(0)
3742; SI-NEXT:    v_mov_b32_e32 v0, s4
3743; SI-NEXT:    v_mov_b32_e32 v1, s5
3744; SI-NEXT:    v_mov_b32_e32 v2, s6
3745; SI-NEXT:    v_mov_b32_e32 v3, s7
3746; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3747; SI-NEXT:    s_endpgm
3748;
3749; VI-LABEL: v16i16_arg:
3750; VI:       ; %bb.0: ; %entry
3751; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
3752; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
3753; VI-NEXT:    s_waitcnt lgkmcnt(0)
3754; VI-NEXT:    v_mov_b32_e32 v0, s8
3755; VI-NEXT:    s_add_u32 s2, s0, 16
3756; VI-NEXT:    s_addc_u32 s3, s1, 0
3757; VI-NEXT:    v_mov_b32_e32 v5, s3
3758; VI-NEXT:    v_mov_b32_e32 v1, s9
3759; VI-NEXT:    v_mov_b32_e32 v2, s10
3760; VI-NEXT:    v_mov_b32_e32 v3, s11
3761; VI-NEXT:    v_mov_b32_e32 v4, s2
3762; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3763; VI-NEXT:    v_mov_b32_e32 v5, s1
3764; VI-NEXT:    v_mov_b32_e32 v0, s4
3765; VI-NEXT:    v_mov_b32_e32 v1, s5
3766; VI-NEXT:    v_mov_b32_e32 v2, s6
3767; VI-NEXT:    v_mov_b32_e32 v3, s7
3768; VI-NEXT:    v_mov_b32_e32 v4, s0
3769; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3770; VI-NEXT:    s_endpgm
3771;
3772; GFX9-LABEL: v16i16_arg:
3773; GFX9:       ; %bb.0: ; %entry
3774; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
3775; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3776; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3777; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3778; GFX9-NEXT:    v_mov_b32_e32 v0, s12
3779; GFX9-NEXT:    v_mov_b32_e32 v1, s13
3780; GFX9-NEXT:    v_mov_b32_e32 v2, s14
3781; GFX9-NEXT:    v_mov_b32_e32 v3, s15
3782; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
3783; GFX9-NEXT:    s_nop 0
3784; GFX9-NEXT:    v_mov_b32_e32 v0, s8
3785; GFX9-NEXT:    v_mov_b32_e32 v1, s9
3786; GFX9-NEXT:    v_mov_b32_e32 v2, s10
3787; GFX9-NEXT:    v_mov_b32_e32 v3, s11
3788; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
3789; GFX9-NEXT:    s_endpgm
3790;
3791; EG-LABEL: v16i16_arg:
3792; EG:       ; %bb.0: ; %entry
3793; EG-NEXT:    ALU 1, @68, KC0[], KC1[]
3794; EG-NEXT:    TEX 0 @36
3795; EG-NEXT:    ALU 5, @70, KC0[], KC1[]
3796; EG-NEXT:    TEX 0 @38
3797; EG-NEXT:    ALU 5, @76, KC0[], KC1[]
3798; EG-NEXT:    TEX 0 @40
3799; EG-NEXT:    ALU 5, @82, KC0[], KC1[]
3800; EG-NEXT:    TEX 0 @42
3801; EG-NEXT:    ALU 5, @88, KC0[], KC1[]
3802; EG-NEXT:    TEX 0 @44
3803; EG-NEXT:    ALU 5, @94, KC0[], KC1[]
3804; EG-NEXT:    TEX 0 @46
3805; EG-NEXT:    ALU 5, @100, KC0[], KC1[]
3806; EG-NEXT:    TEX 0 @48
3807; EG-NEXT:    ALU 5, @106, KC0[], KC1[]
3808; EG-NEXT:    TEX 0 @50
3809; EG-NEXT:    ALU 5, @112, KC0[], KC1[]
3810; EG-NEXT:    TEX 0 @52
3811; EG-NEXT:    ALU 5, @118, KC0[], KC1[]
3812; EG-NEXT:    TEX 0 @54
3813; EG-NEXT:    ALU 5, @124, KC0[], KC1[]
3814; EG-NEXT:    TEX 0 @56
3815; EG-NEXT:    ALU 5, @130, KC0[], KC1[]
3816; EG-NEXT:    TEX 0 @58
3817; EG-NEXT:    ALU 5, @136, KC0[], KC1[]
3818; EG-NEXT:    TEX 0 @60
3819; EG-NEXT:    ALU 5, @142, KC0[], KC1[]
3820; EG-NEXT:    TEX 0 @62
3821; EG-NEXT:    ALU 5, @148, KC0[], KC1[]
3822; EG-NEXT:    TEX 0 @64
3823; EG-NEXT:    ALU 5, @154, KC0[], KC1[]
3824; EG-NEXT:    TEX 0 @66
3825; EG-NEXT:    ALU 13, @160, KC0[CB0:0-32], KC1[]
3826; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
3827; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
3828; EG-NEXT:    CF_END
3829; EG-NEXT:    Fetch clause starting at 36:
3830; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 98, #3
3831; EG-NEXT:    Fetch clause starting at 38:
3832; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 90, #3
3833; EG-NEXT:    Fetch clause starting at 40:
3834; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 82, #3
3835; EG-NEXT:    Fetch clause starting at 42:
3836; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 74, #3
3837; EG-NEXT:    Fetch clause starting at 44:
3838; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 96, #3
3839; EG-NEXT:    Fetch clause starting at 46:
3840; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 88, #3
3841; EG-NEXT:    Fetch clause starting at 48:
3842; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 80, #3
3843; EG-NEXT:    Fetch clause starting at 50:
3844; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 72, #3
3845; EG-NEXT:    Fetch clause starting at 52:
3846; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 94, #3
3847; EG-NEXT:    Fetch clause starting at 54:
3848; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 86, #3
3849; EG-NEXT:    Fetch clause starting at 56:
3850; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 78, #3
3851; EG-NEXT:    Fetch clause starting at 58:
3852; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 70, #3
3853; EG-NEXT:    Fetch clause starting at 60:
3854; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 92, #3
3855; EG-NEXT:    Fetch clause starting at 62:
3856; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 84, #3
3857; EG-NEXT:    Fetch clause starting at 64:
3858; EG-NEXT:     VTX_READ_16 T13.X, T11.X, 76, #3
3859; EG-NEXT:    Fetch clause starting at 66:
3860; EG-NEXT:     VTX_READ_16 T11.X, T11.X, 68, #3
3861; EG-NEXT:    ALU clause starting at 68:
3862; EG-NEXT:     MOV * T0.Y, T3.X,
3863; EG-NEXT:     MOV * T11.X, 0.0,
3864; EG-NEXT:    ALU clause starting at 70:
3865; EG-NEXT:     LSHL T0.W, T12.X, literal.x,
3866; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3867; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
3868; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
3869; EG-NEXT:     MOV T3.X, PV.W,
3870; EG-NEXT:     MOV * T0.Y, T5.X,
3871; EG-NEXT:    ALU clause starting at 76:
3872; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3873; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3874; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3875; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3876; EG-NEXT:     MOV T5.X, PV.W,
3877; EG-NEXT:     MOV * T0.Y, T7.X,
3878; EG-NEXT:    ALU clause starting at 82:
3879; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3880; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3881; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3882; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3883; EG-NEXT:     MOV T7.X, PV.W,
3884; EG-NEXT:     MOV * T0.Y, T9.X,
3885; EG-NEXT:    ALU clause starting at 88:
3886; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3887; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3888; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3889; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3890; EG-NEXT:     MOV T9.X, PV.W,
3891; EG-NEXT:     MOV * T0.Y, T3.X,
3892; EG-NEXT:    ALU clause starting at 94:
3893; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3894; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3895; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3896; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3897; EG-NEXT:     MOV T3.X, PV.W,
3898; EG-NEXT:     MOV * T0.Y, T5.X,
3899; EG-NEXT:    ALU clause starting at 100:
3900; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3901; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3902; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3903; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3904; EG-NEXT:     MOV T5.X, PV.W,
3905; EG-NEXT:     MOV * T0.Y, T7.X,
3906; EG-NEXT:    ALU clause starting at 106:
3907; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3908; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3909; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3910; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3911; EG-NEXT:     MOV T7.X, PV.W,
3912; EG-NEXT:     MOV * T0.Y, T9.X,
3913; EG-NEXT:    ALU clause starting at 112:
3914; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3915; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3916; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3917; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3918; EG-NEXT:     MOV T9.X, PV.W,
3919; EG-NEXT:     MOV * T0.Y, T2.X,
3920; EG-NEXT:    ALU clause starting at 118:
3921; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3922; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3923; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3924; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3925; EG-NEXT:     MOV T2.X, PV.W,
3926; EG-NEXT:     MOV * T0.Y, T4.X,
3927; EG-NEXT:    ALU clause starting at 124:
3928; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3929; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3930; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3931; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3932; EG-NEXT:     MOV T4.X, PV.W,
3933; EG-NEXT:     MOV * T0.Y, T6.X,
3934; EG-NEXT:    ALU clause starting at 130:
3935; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3936; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3937; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3938; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3939; EG-NEXT:     MOV T6.X, PV.W,
3940; EG-NEXT:     MOV * T0.Y, T8.X,
3941; EG-NEXT:    ALU clause starting at 136:
3942; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3943; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3944; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3945; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3946; EG-NEXT:     MOV T8.X, PV.W,
3947; EG-NEXT:     MOV * T0.Y, T2.X,
3948; EG-NEXT:    ALU clause starting at 142:
3949; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3950; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3951; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3952; EG-NEXT:     OR_INT * T12.Z, PV.W, PS,
3953; EG-NEXT:     MOV T2.X, PV.Z,
3954; EG-NEXT:     MOV * T0.Y, T4.X,
3955; EG-NEXT:    ALU clause starting at 148:
3956; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3957; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3958; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3959; EG-NEXT:     OR_INT * T12.X, PV.W, PS,
3960; EG-NEXT:     MOV T4.X, PV.X,
3961; EG-NEXT:     MOV * T0.Y, T6.X,
3962; EG-NEXT:    ALU clause starting at 154:
3963; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3964; EG-NEXT:     AND_INT * T1.W, T13.X, literal.y,
3965; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3966; EG-NEXT:     OR_INT * T11.Z, PV.W, PS,
3967; EG-NEXT:     MOV T6.X, PV.Z,
3968; EG-NEXT:     MOV * T0.Y, T8.X,
3969; EG-NEXT:    ALU clause starting at 160:
3970; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
3971; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3972; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3973; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
3974; EG-NEXT:     AND_INT T0.W, T0.Y, literal.y,
3975; EG-NEXT:     AND_INT * T1.W, T11.X, literal.z,
3976; EG-NEXT:    2(2.802597e-45), -65536(nan)
3977; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3978; EG-NEXT:     OR_INT * T11.X, PV.W, PS,
3979; EG-NEXT:     MOV T8.X, PV.X,
3980; EG-NEXT:     MOV * T12.W, T3.X,
3981; EG-NEXT:     MOV T12.Y, T5.X,
3982; EG-NEXT:     MOV T11.W, T7.X, BS:VEC_120/SCL_212
3983; EG-NEXT:     MOV * T11.Y, T9.X,
3984;
3985; CM-LABEL: v16i16_arg:
3986; CM:       ; %bb.0: ; %entry
3987; CM-NEXT:    ALU 1, @68, KC0[], KC1[]
3988; CM-NEXT:    TEX 0 @36
3989; CM-NEXT:    ALU 5, @70, KC0[], KC1[]
3990; CM-NEXT:    TEX 0 @38
3991; CM-NEXT:    ALU 5, @76, KC0[], KC1[]
3992; CM-NEXT:    TEX 0 @40
3993; CM-NEXT:    ALU 5, @82, KC0[], KC1[]
3994; CM-NEXT:    TEX 0 @42
3995; CM-NEXT:    ALU 5, @88, KC0[], KC1[]
3996; CM-NEXT:    TEX 0 @44
3997; CM-NEXT:    ALU 5, @94, KC0[], KC1[]
3998; CM-NEXT:    TEX 0 @46
3999; CM-NEXT:    ALU 5, @100, KC0[], KC1[]
4000; CM-NEXT:    TEX 0 @48
4001; CM-NEXT:    ALU 5, @106, KC0[], KC1[]
4002; CM-NEXT:    TEX 0 @50
4003; CM-NEXT:    ALU 5, @112, KC0[], KC1[]
4004; CM-NEXT:    TEX 0 @52
4005; CM-NEXT:    ALU 5, @118, KC0[], KC1[]
4006; CM-NEXT:    TEX 0 @54
4007; CM-NEXT:    ALU 5, @124, KC0[], KC1[]
4008; CM-NEXT:    TEX 0 @56
4009; CM-NEXT:    ALU 5, @130, KC0[], KC1[]
4010; CM-NEXT:    TEX 0 @58
4011; CM-NEXT:    ALU 5, @136, KC0[], KC1[]
4012; CM-NEXT:    TEX 0 @60
4013; CM-NEXT:    ALU 5, @142, KC0[], KC1[]
4014; CM-NEXT:    TEX 0 @62
4015; CM-NEXT:    ALU 5, @148, KC0[], KC1[]
4016; CM-NEXT:    TEX 0 @64
4017; CM-NEXT:    ALU 5, @154, KC0[], KC1[]
4018; CM-NEXT:    TEX 0 @66
4019; CM-NEXT:    ALU 14, @160, KC0[CB0:0-32], KC1[]
4020; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
4021; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
4022; CM-NEXT:    CF_END
4023; CM-NEXT:    Fetch clause starting at 36:
4024; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 98, #3
4025; CM-NEXT:    Fetch clause starting at 38:
4026; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 90, #3
4027; CM-NEXT:    Fetch clause starting at 40:
4028; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 82, #3
4029; CM-NEXT:    Fetch clause starting at 42:
4030; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 74, #3
4031; CM-NEXT:    Fetch clause starting at 44:
4032; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 96, #3
4033; CM-NEXT:    Fetch clause starting at 46:
4034; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 88, #3
4035; CM-NEXT:    Fetch clause starting at 48:
4036; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 80, #3
4037; CM-NEXT:    Fetch clause starting at 50:
4038; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 72, #3
4039; CM-NEXT:    Fetch clause starting at 52:
4040; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 94, #3
4041; CM-NEXT:    Fetch clause starting at 54:
4042; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 86, #3
4043; CM-NEXT:    Fetch clause starting at 56:
4044; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 78, #3
4045; CM-NEXT:    Fetch clause starting at 58:
4046; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 70, #3
4047; CM-NEXT:    Fetch clause starting at 60:
4048; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 92, #3
4049; CM-NEXT:    Fetch clause starting at 62:
4050; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 84, #3
4051; CM-NEXT:    Fetch clause starting at 64:
4052; CM-NEXT:     VTX_READ_16 T13.X, T11.X, 76, #3
4053; CM-NEXT:    Fetch clause starting at 66:
4054; CM-NEXT:     VTX_READ_16 T11.X, T11.X, 68, #3
4055; CM-NEXT:    ALU clause starting at 68:
4056; CM-NEXT:     MOV * T0.Y, T3.X,
4057; CM-NEXT:     MOV * T11.X, 0.0,
4058; CM-NEXT:    ALU clause starting at 70:
4059; CM-NEXT:     LSHL T0.Z, T12.X, literal.x,
4060; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
4061; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
4062; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
4063; CM-NEXT:     MOV T3.X, PV.W,
4064; CM-NEXT:     MOV * T0.Y, T5.X,
4065; CM-NEXT:    ALU clause starting at 76:
4066; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4067; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4068; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4069; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4070; CM-NEXT:     MOV T5.X, PV.W,
4071; CM-NEXT:     MOV * T0.Y, T7.X,
4072; CM-NEXT:    ALU clause starting at 82:
4073; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4074; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4075; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4076; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4077; CM-NEXT:     MOV T7.X, PV.W,
4078; CM-NEXT:     MOV * T0.Y, T9.X,
4079; CM-NEXT:    ALU clause starting at 88:
4080; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4081; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4082; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4083; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4084; CM-NEXT:     MOV T9.X, PV.W,
4085; CM-NEXT:     MOV * T0.Y, T3.X,
4086; CM-NEXT:    ALU clause starting at 94:
4087; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4088; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4089; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4090; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4091; CM-NEXT:     MOV T3.X, PV.W,
4092; CM-NEXT:     MOV * T0.Y, T5.X,
4093; CM-NEXT:    ALU clause starting at 100:
4094; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4095; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4096; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4097; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4098; CM-NEXT:     MOV T5.X, PV.W,
4099; CM-NEXT:     MOV * T0.Y, T7.X,
4100; CM-NEXT:    ALU clause starting at 106:
4101; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4102; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4103; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4104; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4105; CM-NEXT:     MOV T7.X, PV.W,
4106; CM-NEXT:     MOV * T0.Y, T9.X,
4107; CM-NEXT:    ALU clause starting at 112:
4108; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4109; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4110; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4111; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4112; CM-NEXT:     MOV T9.X, PV.W,
4113; CM-NEXT:     MOV * T0.Y, T2.X,
4114; CM-NEXT:    ALU clause starting at 118:
4115; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4116; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4117; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4118; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4119; CM-NEXT:     MOV T2.X, PV.W,
4120; CM-NEXT:     MOV * T0.Y, T4.X,
4121; CM-NEXT:    ALU clause starting at 124:
4122; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4123; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4124; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4125; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4126; CM-NEXT:     MOV T4.X, PV.W,
4127; CM-NEXT:     MOV * T0.Y, T6.X,
4128; CM-NEXT:    ALU clause starting at 130:
4129; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4130; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4131; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4132; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4133; CM-NEXT:     MOV T6.X, PV.W,
4134; CM-NEXT:     MOV * T0.Y, T8.X,
4135; CM-NEXT:    ALU clause starting at 136:
4136; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4137; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4138; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4139; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4140; CM-NEXT:     MOV T8.X, PV.W,
4141; CM-NEXT:     MOV * T0.Y, T2.X,
4142; CM-NEXT:    ALU clause starting at 142:
4143; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4144; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4145; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4146; CM-NEXT:     OR_INT * T12.Z, PV.Z, PV.W,
4147; CM-NEXT:     MOV T2.X, PV.Z,
4148; CM-NEXT:     MOV * T0.Y, T4.X,
4149; CM-NEXT:    ALU clause starting at 148:
4150; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4151; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4152; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4153; CM-NEXT:     OR_INT * T12.X, PV.Z, PV.W,
4154; CM-NEXT:     MOV T4.X, PV.X,
4155; CM-NEXT:     MOV * T0.Y, T6.X,
4156; CM-NEXT:    ALU clause starting at 154:
4157; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4158; CM-NEXT:     AND_INT * T0.W, T13.X, literal.y,
4159; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4160; CM-NEXT:     OR_INT * T11.Z, PV.Z, PV.W,
4161; CM-NEXT:     MOV T6.X, PV.Z,
4162; CM-NEXT:     MOV * T0.Y, T8.X,
4163; CM-NEXT:    ALU clause starting at 160:
4164; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4165; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4166; CM-NEXT:     LSHR * T13.X, PV.W, literal.x,
4167; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4168; CM-NEXT:     LSHR T14.X, KC0[2].Y, literal.x,
4169; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
4170; CM-NEXT:     AND_INT * T0.W, T11.X, literal.z,
4171; CM-NEXT:    2(2.802597e-45), -65536(nan)
4172; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4173; CM-NEXT:     OR_INT * T11.X, PV.Z, PV.W,
4174; CM-NEXT:     MOV T8.X, PV.X,
4175; CM-NEXT:     MOV * T12.W, T3.X,
4176; CM-NEXT:     MOV T12.Y, T5.X,
4177; CM-NEXT:     MOV * T11.W, T7.X, BS:VEC_120/SCL_212
4178; CM-NEXT:     MOV * T11.Y, T9.X,
4179entry:
4180  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
4181  ret void
4182}
4183
; Stores a <16 x i32> kernel argument, unchanged, to the global
; (addrspace(1)) pointer %out using an explicit `align 4`.
; Expected output on the amdgcn targets is one s_load_dwordx16 for the vector
; plus one s_load_dwordx2 for the pointer, followed by four dwordx4 stores
; at byte offsets 48, 32, 16 and 0. The r600 targets copy the vector out of
; KC0[6].Y .. KC0[10].X and issue four MEM_RAT_CACHELESS stores.
; All check lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
4184define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
4185; SI-LABEL: v16i32_arg:
4186; SI:       ; %bb.0: ; %entry
4187; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
4188; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4189; SI-NEXT:    s_mov_b32 s3, 0xf000
4190; SI-NEXT:    s_mov_b32 s2, -1
4191; SI-NEXT:    s_waitcnt lgkmcnt(0)
4192; SI-NEXT:    v_mov_b32_e32 v0, s16
4193; SI-NEXT:    v_mov_b32_e32 v1, s17
4194; SI-NEXT:    v_mov_b32_e32 v2, s18
4195; SI-NEXT:    v_mov_b32_e32 v3, s19
4196; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4197; SI-NEXT:    s_waitcnt expcnt(0)
4198; SI-NEXT:    v_mov_b32_e32 v0, s12
4199; SI-NEXT:    v_mov_b32_e32 v1, s13
4200; SI-NEXT:    v_mov_b32_e32 v2, s14
4201; SI-NEXT:    v_mov_b32_e32 v3, s15
4202; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4203; SI-NEXT:    s_waitcnt expcnt(0)
4204; SI-NEXT:    v_mov_b32_e32 v0, s8
4205; SI-NEXT:    v_mov_b32_e32 v1, s9
4206; SI-NEXT:    v_mov_b32_e32 v2, s10
4207; SI-NEXT:    v_mov_b32_e32 v3, s11
4208; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4209; SI-NEXT:    s_waitcnt expcnt(0)
4210; SI-NEXT:    v_mov_b32_e32 v0, s4
4211; SI-NEXT:    v_mov_b32_e32 v1, s5
4212; SI-NEXT:    v_mov_b32_e32 v2, s6
4213; SI-NEXT:    v_mov_b32_e32 v3, s7
4214; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4215; SI-NEXT:    s_endpgm
4216;
4217; VI-LABEL: v16i32_arg:
4218; VI:       ; %bb.0: ; %entry
4219; VI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
4220; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4221; VI-NEXT:    s_waitcnt lgkmcnt(0)
4222; VI-NEXT:    v_mov_b32_e32 v0, s16
4223; VI-NEXT:    s_add_u32 s2, s0, 48
4224; VI-NEXT:    s_addc_u32 s3, s1, 0
4225; VI-NEXT:    v_mov_b32_e32 v5, s3
4226; VI-NEXT:    v_mov_b32_e32 v4, s2
4227; VI-NEXT:    s_add_u32 s2, s0, 32
4228; VI-NEXT:    v_mov_b32_e32 v1, s17
4229; VI-NEXT:    v_mov_b32_e32 v2, s18
4230; VI-NEXT:    v_mov_b32_e32 v3, s19
4231; VI-NEXT:    s_addc_u32 s3, s1, 0
4232; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4233; VI-NEXT:    v_mov_b32_e32 v5, s3
4234; VI-NEXT:    v_mov_b32_e32 v4, s2
4235; VI-NEXT:    s_add_u32 s2, s0, 16
4236; VI-NEXT:    v_mov_b32_e32 v0, s12
4237; VI-NEXT:    v_mov_b32_e32 v1, s13
4238; VI-NEXT:    v_mov_b32_e32 v2, s14
4239; VI-NEXT:    v_mov_b32_e32 v3, s15
4240; VI-NEXT:    s_addc_u32 s3, s1, 0
4241; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4242; VI-NEXT:    v_mov_b32_e32 v5, s3
4243; VI-NEXT:    v_mov_b32_e32 v0, s8
4244; VI-NEXT:    v_mov_b32_e32 v1, s9
4245; VI-NEXT:    v_mov_b32_e32 v2, s10
4246; VI-NEXT:    v_mov_b32_e32 v3, s11
4247; VI-NEXT:    v_mov_b32_e32 v4, s2
4248; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4249; VI-NEXT:    v_mov_b32_e32 v5, s1
4250; VI-NEXT:    v_mov_b32_e32 v0, s4
4251; VI-NEXT:    v_mov_b32_e32 v1, s5
4252; VI-NEXT:    v_mov_b32_e32 v2, s6
4253; VI-NEXT:    v_mov_b32_e32 v3, s7
4254; VI-NEXT:    v_mov_b32_e32 v4, s0
4255; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4256; VI-NEXT:    s_endpgm
4257;
4258; GFX9-LABEL: v16i32_arg:
4259; GFX9:       ; %bb.0: ; %entry
4260; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
4261; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4262; GFX9-NEXT:    v_mov_b32_e32 v4, 0
4263; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4264; GFX9-NEXT:    v_mov_b32_e32 v0, s20
4265; GFX9-NEXT:    v_mov_b32_e32 v1, s21
4266; GFX9-NEXT:    v_mov_b32_e32 v2, s22
4267; GFX9-NEXT:    v_mov_b32_e32 v3, s23
4268; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
4269; GFX9-NEXT:    s_nop 0
4270; GFX9-NEXT:    v_mov_b32_e32 v0, s16
4271; GFX9-NEXT:    v_mov_b32_e32 v1, s17
4272; GFX9-NEXT:    v_mov_b32_e32 v2, s18
4273; GFX9-NEXT:    v_mov_b32_e32 v3, s19
4274; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
4275; GFX9-NEXT:    s_nop 0
4276; GFX9-NEXT:    v_mov_b32_e32 v0, s12
4277; GFX9-NEXT:    v_mov_b32_e32 v1, s13
4278; GFX9-NEXT:    v_mov_b32_e32 v2, s14
4279; GFX9-NEXT:    v_mov_b32_e32 v3, s15
4280; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
4281; GFX9-NEXT:    s_nop 0
4282; GFX9-NEXT:    v_mov_b32_e32 v0, s8
4283; GFX9-NEXT:    v_mov_b32_e32 v1, s9
4284; GFX9-NEXT:    v_mov_b32_e32 v2, s10
4285; GFX9-NEXT:    v_mov_b32_e32 v3, s11
4286; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
4287; GFX9-NEXT:    s_endpgm
4288;
4289; EG-LABEL: v16i32_arg:
4290; EG:       ; %bb.0: ; %entry
4291; EG-NEXT:    ALU 29, @6, KC0[CB0:0-32], KC1[]
4292; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4293; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4294; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4295; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4296; EG-NEXT:    CF_END
4297; EG-NEXT:    ALU clause starting at 6:
4298; EG-NEXT:     MOV * T0.W, KC0[7].X,
4299; EG-NEXT:     MOV * T0.Z, KC0[6].W,
4300; EG-NEXT:     MOV T0.Y, KC0[6].Z,
4301; EG-NEXT:     MOV * T1.W, KC0[8].X,
4302; EG-NEXT:     MOV T0.X, KC0[6].Y,
4303; EG-NEXT:     MOV * T1.Z, KC0[7].W,
4304; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
4305; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
4306; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4307; EG-NEXT:     MOV * T3.W, KC0[9].X,
4308; EG-NEXT:     MOV T1.X, KC0[7].Y,
4309; EG-NEXT:     MOV * T3.Z, KC0[8].W,
4310; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4311; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4312; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
4313; EG-NEXT:     MOV T3.Y, KC0[8].Z,
4314; EG-NEXT:     MOV * T5.W, KC0[10].X,
4315; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4316; EG-NEXT:     MOV T3.X, KC0[8].Y,
4317; EG-NEXT:     MOV * T5.Z, KC0[9].W,
4318; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4319; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4320; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
4321; EG-NEXT:     MOV T5.Y, KC0[9].Z,
4322; EG-NEXT:     MOV * T5.X, KC0[9].Y,
4323; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4324; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4325; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4326; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
4327; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4328;
4329; CM-LABEL: v16i32_arg:
4330; CM:       ; %bb.0: ; %entry
4331; CM-NEXT:    ALU 28, @6, KC0[CB0:0-32], KC1[]
4332; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4333; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4334; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4335; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4336; CM-NEXT:    CF_END
4337; CM-NEXT:    ALU clause starting at 6:
4338; CM-NEXT:     MOV * T0.W, KC0[10].X,
4339; CM-NEXT:     MOV * T0.Z, KC0[9].W,
4340; CM-NEXT:     MOV * T0.Y, KC0[9].Z,
4341; CM-NEXT:     MOV T0.X, KC0[9].Y,
4342; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
4343; CM-NEXT:     MOV * T2.W, KC0[9].X,
4344; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4345; CM-NEXT:     MOV T2.Z, KC0[8].W,
4346; CM-NEXT:     MOV * T1.W, KC0[8].X,
4347; CM-NEXT:     LSHR T3.X, T1.Z, literal.x,
4348; CM-NEXT:     MOV T2.Y, KC0[8].Z,
4349; CM-NEXT:     MOV * T1.Z, KC0[7].W,
4350; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4351; CM-NEXT:     MOV T2.X, KC0[8].Y,
4352; CM-NEXT:     MOV * T1.Y, KC0[7].Z,
4353; CM-NEXT:     MOV T1.X, KC0[7].Y,
4354; CM-NEXT:     ADD_INT T3.Z, KC0[2].Y, literal.x,
4355; CM-NEXT:     MOV * T4.W, KC0[7].X,
4356; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4357; CM-NEXT:     LSHR T5.X, PV.Z, literal.x,
4358; CM-NEXT:     MOV T4.Z, KC0[6].W,
4359; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
4360; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4361; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
4362; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
4363; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4364; CM-NEXT:     MOV * T4.X, KC0[6].Y,
4365; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
4366; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4367entry:
4368  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
4369  ret void
4370}
4371
; Stores a <16 x float> kernel argument, unchanged, to the global
; (addrspace(1)) pointer %out using an explicit `align 4`.
; Expected output on the amdgcn targets is one s_load_dwordx16 for the vector
; plus one s_load_dwordx2 for the pointer, followed by four dwordx4 stores
; at byte offsets 48, 32, 16 and 0. The r600 targets copy the vector out of
; KC0[6].Y .. KC0[10].X and issue four MEM_RAT_CACHELESS stores.
; All check lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
4372define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
4373; SI-LABEL: v16f32_arg:
4374; SI:       ; %bb.0: ; %entry
4375; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
4376; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4377; SI-NEXT:    s_mov_b32 s3, 0xf000
4378; SI-NEXT:    s_mov_b32 s2, -1
4379; SI-NEXT:    s_waitcnt lgkmcnt(0)
4380; SI-NEXT:    v_mov_b32_e32 v0, s16
4381; SI-NEXT:    v_mov_b32_e32 v1, s17
4382; SI-NEXT:    v_mov_b32_e32 v2, s18
4383; SI-NEXT:    v_mov_b32_e32 v3, s19
4384; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4385; SI-NEXT:    s_waitcnt expcnt(0)
4386; SI-NEXT:    v_mov_b32_e32 v0, s12
4387; SI-NEXT:    v_mov_b32_e32 v1, s13
4388; SI-NEXT:    v_mov_b32_e32 v2, s14
4389; SI-NEXT:    v_mov_b32_e32 v3, s15
4390; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4391; SI-NEXT:    s_waitcnt expcnt(0)
4392; SI-NEXT:    v_mov_b32_e32 v0, s8
4393; SI-NEXT:    v_mov_b32_e32 v1, s9
4394; SI-NEXT:    v_mov_b32_e32 v2, s10
4395; SI-NEXT:    v_mov_b32_e32 v3, s11
4396; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4397; SI-NEXT:    s_waitcnt expcnt(0)
4398; SI-NEXT:    v_mov_b32_e32 v0, s4
4399; SI-NEXT:    v_mov_b32_e32 v1, s5
4400; SI-NEXT:    v_mov_b32_e32 v2, s6
4401; SI-NEXT:    v_mov_b32_e32 v3, s7
4402; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4403; SI-NEXT:    s_endpgm
4404;
4405; VI-LABEL: v16f32_arg:
4406; VI:       ; %bb.0: ; %entry
4407; VI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
4408; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4409; VI-NEXT:    s_waitcnt lgkmcnt(0)
4410; VI-NEXT:    v_mov_b32_e32 v0, s16
4411; VI-NEXT:    s_add_u32 s2, s0, 48
4412; VI-NEXT:    s_addc_u32 s3, s1, 0
4413; VI-NEXT:    v_mov_b32_e32 v5, s3
4414; VI-NEXT:    v_mov_b32_e32 v4, s2
4415; VI-NEXT:    s_add_u32 s2, s0, 32
4416; VI-NEXT:    v_mov_b32_e32 v1, s17
4417; VI-NEXT:    v_mov_b32_e32 v2, s18
4418; VI-NEXT:    v_mov_b32_e32 v3, s19
4419; VI-NEXT:    s_addc_u32 s3, s1, 0
4420; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4421; VI-NEXT:    v_mov_b32_e32 v5, s3
4422; VI-NEXT:    v_mov_b32_e32 v4, s2
4423; VI-NEXT:    s_add_u32 s2, s0, 16
4424; VI-NEXT:    v_mov_b32_e32 v0, s12
4425; VI-NEXT:    v_mov_b32_e32 v1, s13
4426; VI-NEXT:    v_mov_b32_e32 v2, s14
4427; VI-NEXT:    v_mov_b32_e32 v3, s15
4428; VI-NEXT:    s_addc_u32 s3, s1, 0
4429; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4430; VI-NEXT:    v_mov_b32_e32 v5, s3
4431; VI-NEXT:    v_mov_b32_e32 v0, s8
4432; VI-NEXT:    v_mov_b32_e32 v1, s9
4433; VI-NEXT:    v_mov_b32_e32 v2, s10
4434; VI-NEXT:    v_mov_b32_e32 v3, s11
4435; VI-NEXT:    v_mov_b32_e32 v4, s2
4436; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4437; VI-NEXT:    v_mov_b32_e32 v5, s1
4438; VI-NEXT:    v_mov_b32_e32 v0, s4
4439; VI-NEXT:    v_mov_b32_e32 v1, s5
4440; VI-NEXT:    v_mov_b32_e32 v2, s6
4441; VI-NEXT:    v_mov_b32_e32 v3, s7
4442; VI-NEXT:    v_mov_b32_e32 v4, s0
4443; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4444; VI-NEXT:    s_endpgm
4445;
4446; GFX9-LABEL: v16f32_arg:
4447; GFX9:       ; %bb.0: ; %entry
4448; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
4449; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4450; GFX9-NEXT:    v_mov_b32_e32 v4, 0
4451; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4452; GFX9-NEXT:    v_mov_b32_e32 v0, s20
4453; GFX9-NEXT:    v_mov_b32_e32 v1, s21
4454; GFX9-NEXT:    v_mov_b32_e32 v2, s22
4455; GFX9-NEXT:    v_mov_b32_e32 v3, s23
4456; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
4457; GFX9-NEXT:    s_nop 0
4458; GFX9-NEXT:    v_mov_b32_e32 v0, s16
4459; GFX9-NEXT:    v_mov_b32_e32 v1, s17
4460; GFX9-NEXT:    v_mov_b32_e32 v2, s18
4461; GFX9-NEXT:    v_mov_b32_e32 v3, s19
4462; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
4463; GFX9-NEXT:    s_nop 0
4464; GFX9-NEXT:    v_mov_b32_e32 v0, s12
4465; GFX9-NEXT:    v_mov_b32_e32 v1, s13
4466; GFX9-NEXT:    v_mov_b32_e32 v2, s14
4467; GFX9-NEXT:    v_mov_b32_e32 v3, s15
4468; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
4469; GFX9-NEXT:    s_nop 0
4470; GFX9-NEXT:    v_mov_b32_e32 v0, s8
4471; GFX9-NEXT:    v_mov_b32_e32 v1, s9
4472; GFX9-NEXT:    v_mov_b32_e32 v2, s10
4473; GFX9-NEXT:    v_mov_b32_e32 v3, s11
4474; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
4475; GFX9-NEXT:    s_endpgm
4476;
4477; EG-LABEL: v16f32_arg:
4478; EG:       ; %bb.0: ; %entry
4479; EG-NEXT:    ALU 29, @6, KC0[CB0:0-32], KC1[]
4480; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4481; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4482; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4483; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4484; EG-NEXT:    CF_END
4485; EG-NEXT:    ALU clause starting at 6:
4486; EG-NEXT:     MOV * T0.W, KC0[7].X,
4487; EG-NEXT:     MOV * T0.Z, KC0[6].W,
4488; EG-NEXT:     MOV T0.Y, KC0[6].Z,
4489; EG-NEXT:     MOV * T1.W, KC0[8].X,
4490; EG-NEXT:     MOV T0.X, KC0[6].Y,
4491; EG-NEXT:     MOV * T1.Z, KC0[7].W,
4492; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
4493; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
4494; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4495; EG-NEXT:     MOV * T3.W, KC0[9].X,
4496; EG-NEXT:     MOV T1.X, KC0[7].Y,
4497; EG-NEXT:     MOV * T3.Z, KC0[8].W,
4498; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4499; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4500; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
4501; EG-NEXT:     MOV T3.Y, KC0[8].Z,
4502; EG-NEXT:     MOV * T5.W, KC0[10].X,
4503; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4504; EG-NEXT:     MOV T3.X, KC0[8].Y,
4505; EG-NEXT:     MOV * T5.Z, KC0[9].W,
4506; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4507; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4508; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
4509; EG-NEXT:     MOV T5.Y, KC0[9].Z,
4510; EG-NEXT:     MOV * T5.X, KC0[9].Y,
4511; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4512; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4513; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4514; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
4515; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4516;
4517; CM-LABEL: v16f32_arg:
4518; CM:       ; %bb.0: ; %entry
4519; CM-NEXT:    ALU 28, @6, KC0[CB0:0-32], KC1[]
4520; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4521; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4522; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4523; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4524; CM-NEXT:    CF_END
4525; CM-NEXT:    ALU clause starting at 6:
4526; CM-NEXT:     MOV * T0.W, KC0[10].X,
4527; CM-NEXT:     MOV * T0.Z, KC0[9].W,
4528; CM-NEXT:     MOV * T0.Y, KC0[9].Z,
4529; CM-NEXT:     MOV T0.X, KC0[9].Y,
4530; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
4531; CM-NEXT:     MOV * T2.W, KC0[9].X,
4532; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4533; CM-NEXT:     MOV T2.Z, KC0[8].W,
4534; CM-NEXT:     MOV * T1.W, KC0[8].X,
4535; CM-NEXT:     LSHR T3.X, T1.Z, literal.x,
4536; CM-NEXT:     MOV T2.Y, KC0[8].Z,
4537; CM-NEXT:     MOV * T1.Z, KC0[7].W,
4538; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4539; CM-NEXT:     MOV T2.X, KC0[8].Y,
4540; CM-NEXT:     MOV * T1.Y, KC0[7].Z,
4541; CM-NEXT:     MOV T1.X, KC0[7].Y,
4542; CM-NEXT:     ADD_INT T3.Z, KC0[2].Y, literal.x,
4543; CM-NEXT:     MOV * T4.W, KC0[7].X,
4544; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4545; CM-NEXT:     LSHR T5.X, PV.Z, literal.x,
4546; CM-NEXT:     MOV T4.Z, KC0[6].W,
4547; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
4548; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4549; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
4550; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
4551; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4552; CM-NEXT:     MOV * T4.X, KC0[6].Y,
4553; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
4554; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4555entry:
4556  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
4557  ret void
4558}
4559
; Stores the i64 kernel argument %a to the global (addrspace(1)) pointer
; %out with `align 8`.
; Expected output on the amdgcn targets is a single s_load_dwordx4 that
; fetches the pointer and the value together, followed by one dwordx2 store.
; The r600 targets read the value from KC0[2].W / KC0[3].X and use
; KC0[2].Y, shifted right by 2, as the store address operand.
; All check lines in this function are autogenerated (see the NOTE on the
; first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
4560define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
4561; SI-LABEL: kernel_arg_i64:
4562; SI:       ; %bb.0:
4563; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4564; SI-NEXT:    s_mov_b32 s7, 0xf000
4565; SI-NEXT:    s_mov_b32 s6, -1
4566; SI-NEXT:    s_waitcnt lgkmcnt(0)
4567; SI-NEXT:    s_mov_b32 s4, s0
4568; SI-NEXT:    s_mov_b32 s5, s1
4569; SI-NEXT:    v_mov_b32_e32 v0, s2
4570; SI-NEXT:    v_mov_b32_e32 v1, s3
4571; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4572; SI-NEXT:    s_endpgm
4573;
4574; VI-LABEL: kernel_arg_i64:
4575; VI:       ; %bb.0:
4576; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
4577; VI-NEXT:    s_waitcnt lgkmcnt(0)
4578; VI-NEXT:    v_mov_b32_e32 v0, s0
4579; VI-NEXT:    v_mov_b32_e32 v1, s1
4580; VI-NEXT:    v_mov_b32_e32 v2, s2
4581; VI-NEXT:    v_mov_b32_e32 v3, s3
4582; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4583; VI-NEXT:    s_endpgm
4584;
4585; GFX9-LABEL: kernel_arg_i64:
4586; GFX9:       ; %bb.0:
4587; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4588; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4589; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4590; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4591; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4592; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
4593; GFX9-NEXT:    s_endpgm
4594;
4595; EG-LABEL: kernel_arg_i64:
4596; EG:       ; %bb.0:
4597; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4598; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4599; EG-NEXT:    CF_END
4600; EG-NEXT:    PAD
4601; EG-NEXT:    ALU clause starting at 4:
4602; EG-NEXT:     MOV * T0.Y, KC0[3].X,
4603; EG-NEXT:     MOV T0.X, KC0[2].W,
4604; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4605; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4606;
4607; CM-LABEL: kernel_arg_i64:
4608; CM:       ; %bb.0:
4609; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4610; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4611; CM-NEXT:    CF_END
4612; CM-NEXT:    PAD
4613; CM-NEXT:    ALU clause starting at 4:
4614; CM-NEXT:     MOV * T0.Y, KC0[3].X,
4615; CM-NEXT:     MOV * T0.X, KC0[2].W,
4616; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4617; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4618  store i64 %a, i64 addrspace(1)* %out, align 8
4619  ret void
4620}
4621
; Kernel that stores its double argument %in through the addrspace(1) pointer
; %out. The expected SI, VI and GFX9 output fetches both kernel arguments with
; one s_load_dwordx4 and writes the value with a single dwordx2 store.
4622define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
4623; SI-LABEL: f64_kernel_arg:
4624; SI:       ; %bb.0: ; %entry
4625; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4626; SI-NEXT:    s_mov_b32 s7, 0xf000
4627; SI-NEXT:    s_mov_b32 s6, -1
4628; SI-NEXT:    s_waitcnt lgkmcnt(0)
4629; SI-NEXT:    s_mov_b32 s4, s0
4630; SI-NEXT:    s_mov_b32 s5, s1
4631; SI-NEXT:    v_mov_b32_e32 v0, s2
4632; SI-NEXT:    v_mov_b32_e32 v1, s3
4633; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4634; SI-NEXT:    s_endpgm
4635;
4636; VI-LABEL: f64_kernel_arg:
4637; VI:       ; %bb.0: ; %entry
4638; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
4639; VI-NEXT:    s_waitcnt lgkmcnt(0)
4640; VI-NEXT:    v_mov_b32_e32 v0, s0
4641; VI-NEXT:    v_mov_b32_e32 v1, s1
4642; VI-NEXT:    v_mov_b32_e32 v2, s2
4643; VI-NEXT:    v_mov_b32_e32 v3, s3
4644; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4645; VI-NEXT:    s_endpgm
4646;
4647; GFX9-LABEL: f64_kernel_arg:
4648; GFX9:       ; %bb.0: ; %entry
4649; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4650; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4651; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4652; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4653; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4654; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
4655; GFX9-NEXT:    s_endpgm
4656;
4657; EG-LABEL: f64_kernel_arg:
4658; EG:       ; %bb.0: ; %entry
4659; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4660; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4661; EG-NEXT:    CF_END
4662; EG-NEXT:    PAD
4663; EG-NEXT:    ALU clause starting at 4:
4664; EG-NEXT:     MOV * T0.Y, KC0[3].X,
4665; EG-NEXT:     MOV T0.X, KC0[2].W,
4666; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4667; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4668;
4669; CM-LABEL: f64_kernel_arg:
4670; CM:       ; %bb.0: ; %entry
4671; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4672; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4673; CM-NEXT:    CF_END
4674; CM-NEXT:    PAD
4675; CM-NEXT:    ALU clause starting at 4:
4676; CM-NEXT:     MOV * T0.Y, KC0[3].X,
4677; CM-NEXT:     MOV * T0.X, KC0[2].W,
4678; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4679; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4680entry:
4681  store double %in, double addrspace(1)* %out
4682  ret void
4683}
4684
4685; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
4686; XGCN: s_load_dwordx2
4687; XGCN: s_load_dwordx2
4688; XGCN: buffer_store_dwordx2
4689; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
4690;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
4691;   ret void
4692; }
4693
; Kernel that stores an i65 argument through %out with 4-byte alignment.
; In the expected SI, VI and GFX9 output the value is written as a dwordx2
; store plus a separate byte store 8 bytes past %out; the byte that is stored
; comes from the third loaded dword masked with 1.
4694define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
4695; SI-LABEL: i65_arg:
4696; SI:       ; %bb.0: ; %entry
4697; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
4698; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
4699; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4700; SI-NEXT:    s_mov_b32 s3, 0xf000
4701; SI-NEXT:    s_waitcnt lgkmcnt(0)
4702; SI-NEXT:    s_and_b32 s6, s2, 1
4703; SI-NEXT:    s_mov_b32 s2, -1
4704; SI-NEXT:    v_mov_b32_e32 v0, s4
4705; SI-NEXT:    v_mov_b32_e32 v1, s5
4706; SI-NEXT:    v_mov_b32_e32 v2, s6
4707; SI-NEXT:    buffer_store_byte v2, off, s[0:3], 0 offset:8
4708; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4709; SI-NEXT:    s_endpgm
4710;
4711; VI-LABEL: i65_arg:
4712; VI:       ; %bb.0: ; %entry
4713; VI-NEXT:    s_load_dword s4, s[0:1], 0x34
4714; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4715; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
4716; VI-NEXT:    s_waitcnt lgkmcnt(0)
4717; VI-NEXT:    s_and_b32 s4, s4, 1
4718; VI-NEXT:    v_mov_b32_e32 v0, s2
4719; VI-NEXT:    v_mov_b32_e32 v1, s3
4720; VI-NEXT:    s_add_u32 s2, s2, 8
4721; VI-NEXT:    s_addc_u32 s3, s3, 0
4722; VI-NEXT:    v_mov_b32_e32 v2, s2
4723; VI-NEXT:    v_mov_b32_e32 v4, s4
4724; VI-NEXT:    v_mov_b32_e32 v3, s3
4725; VI-NEXT:    flat_store_byte v[2:3], v4
4726; VI-NEXT:    v_mov_b32_e32 v3, s1
4727; VI-NEXT:    v_mov_b32_e32 v2, s0
4728; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4729; VI-NEXT:    s_endpgm
4730;
4731; GFX9-LABEL: i65_arg:
4732; GFX9:       ; %bb.0: ; %entry
4733; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
4734; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
4735; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
4736; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4737; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4738; GFX9-NEXT:    s_and_b32 s4, s6, 1
4739; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4740; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4741; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4742; GFX9-NEXT:    global_store_byte v2, v3, s[2:3] offset:8
4743; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
4744; GFX9-NEXT:    s_endpgm
4745;
4746; EG-LABEL: i65_arg:
4747; EG:       ; %bb.0: ; %entry
4748; EG-NEXT:    ALU 20, @6, KC0[CB0:0-32], KC1[]
4749; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
4750; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
4751; EG-NEXT:    MEM_RAT MSKOR T1.XW, T0.X
4752; EG-NEXT:    CF_END
4753; EG-NEXT:    PAD
4754; EG-NEXT:    ALU clause starting at 6:
4755; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4756; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
4757; EG-NEXT:     AND_INT * T1.W, PV.W, literal.x,
4758; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4759; EG-NEXT:     LSHL T1.W, PV.W, literal.x,
4760; EG-NEXT:     AND_INT * T2.W, KC0[3].Y, 1,
4761; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4762; EG-NEXT:     LSHL T1.X, PS, PV.W,
4763; EG-NEXT:     LSHL * T1.W, literal.x, PV.W,
4764; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4765; EG-NEXT:     MOV T1.Y, 0.0,
4766; EG-NEXT:     MOV * T1.Z, 0.0,
4767; EG-NEXT:     LSHR T0.X, T0.W, literal.x,
4768; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4769; EG-NEXT:    2(2.802597e-45), 4(5.605194e-45)
4770; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
4771; EG-NEXT:     MOV * T3.X, KC0[3].X,
4772; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4773; EG-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
4774; EG-NEXT:     MOV * T5.X, KC0[2].W,
4775; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4776;
4777; CM-LABEL: i65_arg:
4778; CM:       ; %bb.0: ; %entry
4779; CM-NEXT:    ALU 21, @6, KC0[CB0:0-32], KC1[]
4780; CM-NEXT:    MEM_RAT MSKOR T1.XW, T5.X
4781; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
4782; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
4783; CM-NEXT:    CF_END
4784; CM-NEXT:    PAD
4785; CM-NEXT:    ALU clause starting at 6:
4786; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4787; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
4788; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
4789; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4790; CM-NEXT:     LSHL T0.Z, PV.W, literal.x,
4791; CM-NEXT:     AND_INT * T1.W, KC0[3].Y, 1,
4792; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4793; CM-NEXT:     LSHL T1.X, PV.W, PV.Z,
4794; CM-NEXT:     LSHL * T1.W, literal.x, PV.Z,
4795; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4796; CM-NEXT:     MOV T1.Y, 0.0,
4797; CM-NEXT:     MOV * T1.Z, 0.0,
4798; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
4799; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4800; CM-NEXT:     MOV T2.X, KC0[2].W,
4801; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4802; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
4803; CM-NEXT:     LSHR * T3.X, PV.W, literal.x,
4804; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4805; CM-NEXT:     MOV * T4.X, KC0[3].X,
4806; CM-NEXT:     LSHR * T5.X, T0.W, literal.x,
4807; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4808entry:
4809  store i65 %in, i65 addrspace(1)* %out, align 4
4810  ret void
4811}
4812
; Kernel that stores its i1 argument %x through %out with 1-byte alignment.
; The expected SI, VI and GFX9 output masks the loaded argument dword with 1
; and then emits a byte store.
4813define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
4814; SI-LABEL: i1_arg:
4815; SI:       ; %bb.0:
4816; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
4817; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4818; SI-NEXT:    s_mov_b32 s3, 0xf000
4819; SI-NEXT:    s_waitcnt lgkmcnt(0)
4820; SI-NEXT:    s_and_b32 s4, s2, 1
4821; SI-NEXT:    s_mov_b32 s2, -1
4822; SI-NEXT:    v_mov_b32_e32 v0, s4
4823; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
4824; SI-NEXT:    s_endpgm
4825;
4826; VI-LABEL: i1_arg:
4827; VI:       ; %bb.0:
4828; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
4829; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4830; VI-NEXT:    s_waitcnt lgkmcnt(0)
4831; VI-NEXT:    s_and_b32 s2, s2, 1
4832; VI-NEXT:    v_mov_b32_e32 v0, s0
4833; VI-NEXT:    v_mov_b32_e32 v1, s1
4834; VI-NEXT:    v_mov_b32_e32 v2, s2
4835; VI-NEXT:    flat_store_byte v[0:1], v2
4836; VI-NEXT:    s_endpgm
4837;
4838; GFX9-LABEL: i1_arg:
4839; GFX9:       ; %bb.0:
4840; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
4841; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4842; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4843; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4844; GFX9-NEXT:    s_and_b32 s2, s2, 1
4845; GFX9-NEXT:    v_mov_b32_e32 v1, s2
4846; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
4847; GFX9-NEXT:    s_endpgm
4848;
4849; EG-LABEL: i1_arg:
4850; EG:       ; %bb.0:
4851; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
4852; EG-NEXT:    TEX 0 @6
4853; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
4854; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
4855; EG-NEXT:    CF_END
4856; EG-NEXT:    PAD
4857; EG-NEXT:    Fetch clause starting at 6:
4858; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4859; EG-NEXT:    ALU clause starting at 8:
4860; EG-NEXT:     MOV * T0.X, 0.0,
4861; EG-NEXT:    ALU clause starting at 9:
4862; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
4863; EG-NEXT:     AND_INT * T1.W, T0.X, 1,
4864; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4865; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
4866; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4867; EG-NEXT:     LSHL T0.X, T1.W, PV.W,
4868; EG-NEXT:     LSHL * T0.W, literal.x, PV.W,
4869; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4870; EG-NEXT:     MOV T0.Y, 0.0,
4871; EG-NEXT:     MOV * T0.Z, 0.0,
4872; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4873; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4874;
4875; CM-LABEL: i1_arg:
4876; CM:       ; %bb.0:
4877; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
4878; CM-NEXT:    TEX 0 @6
4879; CM-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
4880; CM-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
4881; CM-NEXT:    CF_END
4882; CM-NEXT:    PAD
4883; CM-NEXT:    Fetch clause starting at 6:
4884; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4885; CM-NEXT:    ALU clause starting at 8:
4886; CM-NEXT:     MOV * T0.X, 0.0,
4887; CM-NEXT:    ALU clause starting at 9:
4888; CM-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
4889; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4890; CM-NEXT:     AND_INT T0.Z, T0.X, 1,
4891; CM-NEXT:     LSHL * T0.W, PV.W, literal.x,
4892; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4893; CM-NEXT:     LSHL T0.X, PV.Z, PV.W,
4894; CM-NEXT:     LSHL * T0.W, literal.x, PV.W,
4895; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4896; CM-NEXT:     MOV T0.Y, 0.0,
4897; CM-NEXT:     MOV * T0.Z, 0.0,
4898; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4899; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4900  store i1 %x, i1 addrspace(1)* %out, align 1
4901  ret void
4902}
4903
; Kernel that zero-extends its i1 argument %x to i32 and stores the result
; through %out. The expected SI, VI and GFX9 output implements the extension
; as an s_and_b32 of the loaded argument dword with 1.
4904define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
4905; SI-LABEL: i1_arg_zext_i32:
4906; SI:       ; %bb.0:
4907; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
4908; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4909; SI-NEXT:    s_mov_b32 s3, 0xf000
4910; SI-NEXT:    s_waitcnt lgkmcnt(0)
4911; SI-NEXT:    s_and_b32 s4, s2, 1
4912; SI-NEXT:    s_mov_b32 s2, -1
4913; SI-NEXT:    v_mov_b32_e32 v0, s4
4914; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4915; SI-NEXT:    s_endpgm
4916;
4917; VI-LABEL: i1_arg_zext_i32:
4918; VI:       ; %bb.0:
4919; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
4920; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4921; VI-NEXT:    s_waitcnt lgkmcnt(0)
4922; VI-NEXT:    s_and_b32 s2, s2, 1
4923; VI-NEXT:    v_mov_b32_e32 v0, s0
4924; VI-NEXT:    v_mov_b32_e32 v1, s1
4925; VI-NEXT:    v_mov_b32_e32 v2, s2
4926; VI-NEXT:    flat_store_dword v[0:1], v2
4927; VI-NEXT:    s_endpgm
4928;
4929; GFX9-LABEL: i1_arg_zext_i32:
4930; GFX9:       ; %bb.0:
4931; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
4932; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4933; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4934; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4935; GFX9-NEXT:    s_and_b32 s2, s2, 1
4936; GFX9-NEXT:    v_mov_b32_e32 v1, s2
4937; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
4938; GFX9-NEXT:    s_endpgm
4939;
4940; EG-LABEL: i1_arg_zext_i32:
4941; EG:       ; %bb.0:
4942; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
4943; EG-NEXT:    TEX 0 @6
4944; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
4945; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
4946; EG-NEXT:    CF_END
4947; EG-NEXT:    PAD
4948; EG-NEXT:    Fetch clause starting at 6:
4949; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4950; EG-NEXT:    ALU clause starting at 8:
4951; EG-NEXT:     MOV * T0.X, 0.0,
4952; EG-NEXT:    ALU clause starting at 9:
4953; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4954; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4955;
4956; CM-LABEL: i1_arg_zext_i32:
4957; CM:       ; %bb.0:
4958; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
4959; CM-NEXT:    TEX 0 @6
4960; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
4961; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
4962; CM-NEXT:    CF_END
4963; CM-NEXT:    PAD
4964; CM-NEXT:    Fetch clause starting at 6:
4965; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4966; CM-NEXT:    ALU clause starting at 8:
4967; CM-NEXT:     MOV * T0.X, 0.0,
4968; CM-NEXT:    ALU clause starting at 9:
4969; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4970; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4971  %ext = zext i1 %x to i32
4972  store i32 %ext, i32 addrspace(1)* %out, align 4
4973  ret void
4974}
4975
; Kernel that zero-extends its i1 argument %x to i64 and stores the result
; through %out. In the expected SI, VI and GFX9 output the low dword is the
; loaded argument masked with 1 and the high dword is a constant 0.
4976define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
4977; SI-LABEL: i1_arg_zext_i64:
4978; SI:       ; %bb.0:
4979; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
4980; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4981; SI-NEXT:    s_mov_b32 s3, 0xf000
4982; SI-NEXT:    s_mov_b32 s2, -1
4983; SI-NEXT:    s_waitcnt lgkmcnt(0)
4984; SI-NEXT:    s_and_b32 s4, s4, 1
4985; SI-NEXT:    v_mov_b32_e32 v1, 0
4986; SI-NEXT:    v_mov_b32_e32 v0, s4
4987; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4988; SI-NEXT:    s_endpgm
4989;
4990; VI-LABEL: i1_arg_zext_i64:
4991; VI:       ; %bb.0:
4992; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
4993; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
4994; VI-NEXT:    v_mov_b32_e32 v1, 0
4995; VI-NEXT:    s_waitcnt lgkmcnt(0)
4996; VI-NEXT:    s_and_b32 s2, s2, 1
4997; VI-NEXT:    v_mov_b32_e32 v3, s1
4998; VI-NEXT:    v_mov_b32_e32 v0, s2
4999; VI-NEXT:    v_mov_b32_e32 v2, s0
5000; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
5001; VI-NEXT:    s_endpgm
5002;
5003; GFX9-LABEL: i1_arg_zext_i64:
5004; GFX9:       ; %bb.0:
5005; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
5006; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5007; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5008; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5009; GFX9-NEXT:    s_and_b32 s2, s2, 1
5010; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5011; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
5012; GFX9-NEXT:    s_endpgm
5013;
5014; EG-LABEL: i1_arg_zext_i64:
5015; EG:       ; %bb.0:
5016; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
5017; EG-NEXT:    TEX 0 @6
5018; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5019; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5020; EG-NEXT:    CF_END
5021; EG-NEXT:    PAD
5022; EG-NEXT:    Fetch clause starting at 6:
5023; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5024; EG-NEXT:    ALU clause starting at 8:
5025; EG-NEXT:     MOV * T0.X, 0.0,
5026; EG-NEXT:    ALU clause starting at 9:
5027; EG-NEXT:     MOV * T0.Y, 0.0,
5028; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5029; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5030;
5031; CM-LABEL: i1_arg_zext_i64:
5032; CM:       ; %bb.0:
5033; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
5034; CM-NEXT:    TEX 0 @6
5035; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5036; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5037; CM-NEXT:    CF_END
5038; CM-NEXT:    PAD
5039; CM-NEXT:    Fetch clause starting at 6:
5040; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5041; CM-NEXT:    ALU clause starting at 8:
5042; CM-NEXT:     MOV * T0.X, 0.0,
5043; CM-NEXT:    ALU clause starting at 9:
5044; CM-NEXT:     MOV * T0.Y, 0.0,
5045; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5046; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5047  %ext = zext i1 %x to i64
5048  store i64 %ext, i64 addrspace(1)* %out, align 8
5049  ret void
5050}
5051
; Kernel that sign-extends its i1 argument %x to i32 and stores the result
; through %out. The expected SI, VI and GFX9 output implements the extension
; with s_bfe_i32 using the operand 0x10000; the r600 targets use BFE_INT.
5052define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
5053; SI-LABEL: i1_arg_sext_i32:
5054; SI:       ; %bb.0:
5055; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
5056; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5057; SI-NEXT:    s_mov_b32 s3, 0xf000
5058; SI-NEXT:    s_waitcnt lgkmcnt(0)
5059; SI-NEXT:    s_bfe_i32 s4, s2, 0x10000
5060; SI-NEXT:    s_mov_b32 s2, -1
5061; SI-NEXT:    v_mov_b32_e32 v0, s4
5062; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5063; SI-NEXT:    s_endpgm
5064;
5065; VI-LABEL: i1_arg_sext_i32:
5066; VI:       ; %bb.0:
5067; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
5068; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5069; VI-NEXT:    s_waitcnt lgkmcnt(0)
5070; VI-NEXT:    s_bfe_i32 s2, s2, 0x10000
5071; VI-NEXT:    v_mov_b32_e32 v0, s0
5072; VI-NEXT:    v_mov_b32_e32 v1, s1
5073; VI-NEXT:    v_mov_b32_e32 v2, s2
5074; VI-NEXT:    flat_store_dword v[0:1], v2
5075; VI-NEXT:    s_endpgm
5076;
5077; GFX9-LABEL: i1_arg_sext_i32:
5078; GFX9:       ; %bb.0:
5079; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
5080; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5081; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5082; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5083; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x10000
5084; GFX9-NEXT:    v_mov_b32_e32 v1, s2
5085; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
5086; GFX9-NEXT:    s_endpgm
5087;
5088; EG-LABEL: i1_arg_sext_i32:
5089; EG:       ; %bb.0:
5090; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
5091; EG-NEXT:    TEX 0 @6
5092; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5093; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
5094; EG-NEXT:    CF_END
5095; EG-NEXT:    PAD
5096; EG-NEXT:    Fetch clause starting at 6:
5097; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5098; EG-NEXT:    ALU clause starting at 8:
5099; EG-NEXT:     MOV * T0.X, 0.0,
5100; EG-NEXT:    ALU clause starting at 9:
5101; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, 1,
5102; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5103; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5104;
5105; CM-LABEL: i1_arg_sext_i32:
5106; CM:       ; %bb.0:
5107; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
5108; CM-NEXT:    TEX 0 @6
5109; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5110; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
5111; CM-NEXT:    CF_END
5112; CM-NEXT:    PAD
5113; CM-NEXT:    Fetch clause starting at 6:
5114; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5115; CM-NEXT:    ALU clause starting at 8:
5116; CM-NEXT:     MOV * T0.X, 0.0,
5117; CM-NEXT:    ALU clause starting at 9:
5118; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, 1,
5119; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5120; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5121  %ext = sext i1 %x to i32
5122  store i32 %ext, i32 addrspace(1)* %out, align 4
5123  ret void
5124}
5125
; Kernel that sign-extends its i1 argument %x to i64 and stores the result
; through %out. The expected SI, VI and GFX9 output implements the extension
; with s_bfe_i64 using the operand 0x10000 and stores both result dwords.
5126define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
5127; SI-LABEL: i1_arg_sext_i64:
5128; SI:       ; %bb.0:
5129; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
5130; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5131; SI-NEXT:    s_mov_b32 s3, 0xf000
5132; SI-NEXT:    s_waitcnt lgkmcnt(0)
5133; SI-NEXT:    s_bfe_i64 s[4:5], s[2:3], 0x10000
5134; SI-NEXT:    s_mov_b32 s2, -1
5135; SI-NEXT:    v_mov_b32_e32 v0, s4
5136; SI-NEXT:    v_mov_b32_e32 v1, s5
5137; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5138; SI-NEXT:    s_endpgm
5139;
5140; VI-LABEL: i1_arg_sext_i64:
5141; VI:       ; %bb.0:
5142; VI-NEXT:    s_load_dword s2, s[0:1], 0x2c
5143; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
5144; VI-NEXT:    s_waitcnt lgkmcnt(0)
5145; VI-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
5146; VI-NEXT:    v_mov_b32_e32 v0, s0
5147; VI-NEXT:    v_mov_b32_e32 v2, s2
5148; VI-NEXT:    v_mov_b32_e32 v1, s1
5149; VI-NEXT:    v_mov_b32_e32 v3, s3
5150; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5151; VI-NEXT:    s_endpgm
5152;
5153; GFX9-LABEL: i1_arg_sext_i64:
5154; GFX9:       ; %bb.0:
5155; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x8
5156; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
5157; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5158; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5159; GFX9-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x10000
5160; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5161; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5162; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
5163; GFX9-NEXT:    s_endpgm
5164;
5165; EG-LABEL: i1_arg_sext_i64:
5166; EG:       ; %bb.0:
5167; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
5168; EG-NEXT:    TEX 0 @6
5169; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
5170; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5171; EG-NEXT:    CF_END
5172; EG-NEXT:    PAD
5173; EG-NEXT:    Fetch clause starting at 6:
5174; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5175; EG-NEXT:    ALU clause starting at 8:
5176; EG-NEXT:     MOV * T0.X, 0.0,
5177; EG-NEXT:    ALU clause starting at 9:
5178; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, 1,
5179; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5180; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5181; EG-NEXT:     MOV * T0.Y, PV.X,
5182;
5183; CM-LABEL: i1_arg_sext_i64:
5184; CM:       ; %bb.0:
5185; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
5186; CM-NEXT:    TEX 0 @6
5187; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
5188; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5189; CM-NEXT:    CF_END
5190; CM-NEXT:    PAD
5191; CM-NEXT:    Fetch clause starting at 6:
5192; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5193; CM-NEXT:    ALU clause starting at 8:
5194; CM-NEXT:     MOV * T0.X, 0.0,
5195; CM-NEXT:    ALU clause starting at 9:
5196; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, 1,
5197; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
5198; CM-NEXT:     MOV * T0.Y, PV.X,
5199; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5200  %ext = sext i1 %x to i64
5201  store i64 %ext, i64 addrspace(1)* %out, align 8
5202  ret void
5203}
5204
; Kernel whose only argument is an empty struct and whose body is just
; `ret void`. Every expected output contains nothing but the program end
; (s_endpgm on the amdgcn targets, CF_END plus PAD on the r600 targets).
5205define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
5206; SI-LABEL: empty_struct_arg:
5207; SI:       ; %bb.0:
5208; SI-NEXT:    s_endpgm
5209;
5210; VI-LABEL: empty_struct_arg:
5211; VI:       ; %bb.0:
5212; VI-NEXT:    s_endpgm
5213;
5214; GFX9-LABEL: empty_struct_arg:
5215; GFX9:       ; %bb.0:
5216; GFX9-NEXT:    s_endpgm
5217;
5218; EGCM-LABEL: empty_struct_arg:
5219; EGCM:       ; %bb.0:
5220; EGCM-NEXT:    CF_END
5221; EGCM-NEXT:    PAD
5222  ret void
5223}
5224
5225; The correct load offsets for these:
5226; load 4 from 0,
5227; load 8 from 8
5228; load 4 from 24
5229; load 8 from 32
5230
5231; With the SelectionDAG argument lowering, the alignments of the
5232; struct members are not properly considered, making these wrong.
5233
5234; FIXME: Total argument size is computed wrong
; Kernel taking two {i32, i64} struct arguments with an unnamed i8 argument
; between them. Each struct field is extracted and written with a volatile
; store to a null addrspace(1) pointer, so the expected output shows one
; argument load per field (dword for the i32 fields, dwordx2 for the i64
; fields) together with the offset each load uses.
5235define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
5236; SI-LABEL: struct_argument_alignment:
5237; SI:       ; %bb.0:
5238; SI-NEXT:    s_load_dword s8, s[0:1], 0x9
5239; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5240; SI-NEXT:    s_load_dword s9, s[0:1], 0xf
5241; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x11
5242; SI-NEXT:    s_mov_b32 s0, 0
5243; SI-NEXT:    s_mov_b32 s3, 0xf000
5244; SI-NEXT:    s_mov_b32 s2, -1
5245; SI-NEXT:    s_mov_b32 s1, s0
5246; SI-NEXT:    s_waitcnt lgkmcnt(0)
5247; SI-NEXT:    v_mov_b32_e32 v0, s8
5248; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5249; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5250; SI-NEXT:    v_mov_b32_e32 v0, s4
5251; SI-NEXT:    v_mov_b32_e32 v1, s5
5252; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5253; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5254; SI-NEXT:    v_mov_b32_e32 v0, s9
5255; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5256; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5257; SI-NEXT:    v_mov_b32_e32 v0, s6
5258; SI-NEXT:    v_mov_b32_e32 v1, s7
5259; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5260; SI-NEXT:    s_waitcnt vmcnt(0)
5261; SI-NEXT:    s_endpgm
5262;
5263; VI-LABEL: struct_argument_alignment:
5264; VI:       ; %bb.0:
5265; VI-NEXT:    s_load_dword s4, s[0:1], 0x24
5266; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5267; VI-NEXT:    s_load_dword s5, s[0:1], 0x3c
5268; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
5269; VI-NEXT:    v_mov_b32_e32 v0, 0
5270; VI-NEXT:    v_mov_b32_e32 v1, 0
5271; VI-NEXT:    s_waitcnt lgkmcnt(0)
5272; VI-NEXT:    v_mov_b32_e32 v2, s4
5273; VI-NEXT:    flat_store_dword v[0:1], v2
5274; VI-NEXT:    s_waitcnt vmcnt(0)
5275; VI-NEXT:    v_mov_b32_e32 v2, s2
5276; VI-NEXT:    v_mov_b32_e32 v3, s3
5277; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5278; VI-NEXT:    s_waitcnt vmcnt(0)
5279; VI-NEXT:    v_mov_b32_e32 v2, s5
5280; VI-NEXT:    flat_store_dword v[0:1], v2
5281; VI-NEXT:    s_waitcnt vmcnt(0)
5282; VI-NEXT:    v_mov_b32_e32 v3, s1
5283; VI-NEXT:    v_mov_b32_e32 v2, s0
5284; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5285; VI-NEXT:    s_waitcnt vmcnt(0)
5286; VI-NEXT:    s_endpgm
5287;
5288; GFX9-LABEL: struct_argument_alignment:
5289; GFX9:       ; %bb.0:
5290; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
5291; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5292; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x18
5293; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x20
5294; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5295; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5296; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5297; GFX9-NEXT:    v_mov_b32_e32 v2, s6
5298; GFX9-NEXT:    global_store_dword v[0:1], v2, off
5299; GFX9-NEXT:    s_waitcnt vmcnt(0)
5300; GFX9-NEXT:    v_mov_b32_e32 v3, s1
5301; GFX9-NEXT:    v_mov_b32_e32 v2, s0
5302; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
5303; GFX9-NEXT:    s_waitcnt vmcnt(0)
5304; GFX9-NEXT:    v_mov_b32_e32 v2, s7
5305; GFX9-NEXT:    global_store_dword v[0:1], v2, off
5306; GFX9-NEXT:    s_waitcnt vmcnt(0)
5307; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5308; GFX9-NEXT:    v_mov_b32_e32 v3, s3
5309; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
5310; GFX9-NEXT:    s_waitcnt vmcnt(0)
5311; GFX9-NEXT:    s_endpgm
5312;
5313; EG-LABEL: struct_argument_alignment:
5314; EG:       ; %bb.0:
5315; EG-NEXT:    ALU 9, @8, KC0[CB0:0-32], KC1[]
5316; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0
5317; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
5318; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0
5319; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0
5320; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0
5321; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
5322; EG-NEXT:    CF_END
5323; EG-NEXT:    ALU clause starting at 8:
5324; EG-NEXT:     MOV T0.X, KC0[4].Y,
5325; EG-NEXT:     MOV * T1.X, KC0[4].Z,
5326; EG-NEXT:     MOV T2.X, KC0[3].W,
5327; EG-NEXT:     MOV * T3.X, KC0[2].W,
5328; EG-NEXT:     MOV T4.X, literal.x,
5329; EG-NEXT:     MOV * T5.X, KC0[3].X,
5330; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5331; EG-NEXT:     MOV T6.X, literal.x,
5332; EG-NEXT:     MOV * T7.X, KC0[2].Y,
5333; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5334;
5335; CM-LABEL: struct_argument_alignment:
5336; CM:       ; %bb.0:
5337; CM-NEXT:    ALU 9, @8, KC0[CB0:0-32], KC1[]
5338; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X
5339; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X
5340; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X
5341; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X
5342; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5343; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X
5344; CM-NEXT:    CF_END
5345; CM-NEXT:    ALU clause starting at 8:
5346; CM-NEXT:     MOV * T0.X, KC0[4].Y,
5347; CM-NEXT:     MOV * T1.X, KC0[4].Z,
5348; CM-NEXT:     MOV * T2.X, KC0[3].W,
5349; CM-NEXT:     MOV * T3.X, KC0[2].W,
5350; CM-NEXT:     MOV * T4.X, literal.x,
5351; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5352; CM-NEXT:     MOV * T5.X, KC0[3].X,
5353; CM-NEXT:     MOV * T6.X, literal.x,
5354; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5355; CM-NEXT:     MOV * T7.X, KC0[2].Y,
5356  %val0 = extractvalue {i32, i64} %arg0, 0
5357  %val1 = extractvalue {i32, i64} %arg0, 1
5358  %val2 = extractvalue {i32, i64} %arg1, 0
5359  %val3 = extractvalue {i32, i64} %arg1, 1
5360  store volatile i32 %val0, i32 addrspace(1)* null
5361  store volatile i64 %val1, i64 addrspace(1)* null
5362  store volatile i32 %val2, i32 addrspace(1)* null
5363  store volatile i64 %val3, i64 addrspace(1)* null
5364  ret void
5365}
5366
5367; No padding between the i8 and the next struct, but round up at the end
5368; to a 4-byte multiple.
5369define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
5370; SI-LABEL: packed_struct_argument_alignment:
5371; SI:       ; %bb.0:
5372; SI-NEXT:    s_mov_b32 s3, 0xf000
5373; SI-NEXT:    s_mov_b32 s2, -1
5374; SI-NEXT:    s_load_dword s6, s[0:1], 0x9
5375; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xa
5376; SI-NEXT:    buffer_load_ubyte v4, off, s[0:3], 0 offset:49
5377; SI-NEXT:    buffer_load_ubyte v5, off, s[0:3], 0 offset:50
5378; SI-NEXT:    buffer_load_ubyte v6, off, s[0:3], 0 offset:51
5379; SI-NEXT:    buffer_load_ubyte v7, off, s[0:3], 0 offset:52
5380; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:53
5381; SI-NEXT:    s_mov_b32 s0, 0
5382; SI-NEXT:    s_mov_b32 s1, s0
5383; SI-NEXT:    s_waitcnt lgkmcnt(0)
5384; SI-NEXT:    v_mov_b32_e32 v2, s6
5385; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
5386; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5387; SI-NEXT:    v_mov_b32_e32 v2, s4
5388; SI-NEXT:    v_mov_b32_e32 v3, s5
5389; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
5390; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5391; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
5392; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
5393; SI-NEXT:    v_or_b32_e32 v2, v2, v4
5394; SI-NEXT:    v_or_b32_e32 v3, v3, v6
5395; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
5396; SI-NEXT:    v_or_b32_e32 v2, v3, v2
5397; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
5398; SI-NEXT:    s_waitcnt vmcnt(0)
5399; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5400; SI-NEXT:    s_waitcnt vmcnt(0)
5401; SI-NEXT:    s_endpgm
5402;
5403; VI-LABEL: packed_struct_argument_alignment:
5404; VI:       ; %bb.0:
5405; VI-NEXT:    s_add_u32 s2, s0, 49
5406; VI-NEXT:    s_addc_u32 s3, s1, 0
5407; VI-NEXT:    s_add_u32 s4, s0, 50
5408; VI-NEXT:    s_addc_u32 s5, s1, 0
5409; VI-NEXT:    v_mov_b32_e32 v2, s2
5410; VI-NEXT:    v_mov_b32_e32 v3, s3
5411; VI-NEXT:    s_add_u32 s2, s2, 3
5412; VI-NEXT:    s_addc_u32 s3, s3, 0
5413; VI-NEXT:    v_mov_b32_e32 v5, s3
5414; VI-NEXT:    v_mov_b32_e32 v4, s2
5415; VI-NEXT:    s_add_u32 s2, s0, 51
5416; VI-NEXT:    s_addc_u32 s3, s1, 0
5417; VI-NEXT:    v_mov_b32_e32 v0, s4
5418; VI-NEXT:    v_mov_b32_e32 v7, s3
5419; VI-NEXT:    v_mov_b32_e32 v1, s5
5420; VI-NEXT:    v_mov_b32_e32 v6, s2
5421; VI-NEXT:    flat_load_ubyte v8, v[0:1]
5422; VI-NEXT:    flat_load_ubyte v9, v[2:3]
5423; VI-NEXT:    flat_load_ubyte v10, v[4:5]
5424; VI-NEXT:    flat_load_ubyte v6, v[6:7]
5425; VI-NEXT:    s_add_u32 s2, s0, 53
5426; VI-NEXT:    s_addc_u32 s3, s1, 0
5427; VI-NEXT:    v_mov_b32_e32 v0, s2
5428; VI-NEXT:    v_mov_b32_e32 v1, s3
5429; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
5430; VI-NEXT:    s_load_dword s2, s[0:1], 0x24
5431; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x28
5432; VI-NEXT:    v_mov_b32_e32 v2, 0
5433; VI-NEXT:    v_mov_b32_e32 v3, 0
5434; VI-NEXT:    s_waitcnt lgkmcnt(0)
5435; VI-NEXT:    v_mov_b32_e32 v7, s2
5436; VI-NEXT:    v_mov_b32_e32 v5, s1
5437; VI-NEXT:    v_mov_b32_e32 v4, s0
5438; VI-NEXT:    flat_store_dword v[2:3], v7
5439; VI-NEXT:    s_waitcnt vmcnt(0)
5440; VI-NEXT:    flat_store_dwordx2 v[2:3], v[4:5]
5441; VI-NEXT:    s_waitcnt vmcnt(0)
5442; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
5443; VI-NEXT:    v_or_b32_e32 v4, v4, v9
5444; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v10
5445; VI-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5446; VI-NEXT:    v_or_b32_e32 v4, v5, v4
5447; VI-NEXT:    flat_store_dword v[2:3], v4
5448; VI-NEXT:    s_waitcnt vmcnt(0)
5449; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
5450; VI-NEXT:    s_waitcnt vmcnt(0)
5451; VI-NEXT:    s_endpgm
5452;
5453; GFX9-LABEL: packed_struct_argument_alignment:
5454; GFX9:       ; %bb.0:
5455; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5456; GFX9-NEXT:    global_load_dword v6, v2, s[4:5] offset:13
5457; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:17
5458; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
5459; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4
5460; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5461; GFX9-NEXT:    v_mov_b32_e32 v3, 0
5462; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5463; GFX9-NEXT:    v_mov_b32_e32 v7, s2
5464; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5465; GFX9-NEXT:    v_mov_b32_e32 v4, s0
5466; GFX9-NEXT:    global_store_dword v[2:3], v7, off
5467; GFX9-NEXT:    s_waitcnt vmcnt(0)
5468; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
5469; GFX9-NEXT:    s_waitcnt vmcnt(0)
5470; GFX9-NEXT:    global_store_dword v[2:3], v6, off
5471; GFX9-NEXT:    s_waitcnt vmcnt(0)
5472; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
5473; GFX9-NEXT:    s_waitcnt vmcnt(0)
5474; GFX9-NEXT:    s_endpgm
5475;
5476; EG-LABEL: packed_struct_argument_alignment:
5477; EG:       ; %bb.0:
5478; EG-NEXT:    ALU 6, @18, KC0[CB0:0-32], KC1[]
5479; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
5480; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5481; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5482; EG-NEXT:    ALU 2, @25, KC0[], KC1[]
5483; EG-NEXT:    TEX 0 @12
5484; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5485; EG-NEXT:    TEX 0 @14
5486; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5487; EG-NEXT:    TEX 0 @16
5488; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1
5489; EG-NEXT:    CF_END
5490; EG-NEXT:    Fetch clause starting at 12:
5491; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 49, #3
5492; EG-NEXT:    Fetch clause starting at 14:
5493; EG-NEXT:     VTX_READ_32 T2.X, T2.X, 57, #3
5494; EG-NEXT:    Fetch clause starting at 16:
5495; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 53, #3
5496; EG-NEXT:    ALU clause starting at 18:
5497; EG-NEXT:     MOV T0.X, KC0[2].Z,
5498; EG-NEXT:     MOV * T1.X, literal.x,
5499; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5500; EG-NEXT:     MOV T2.X, KC0[2].W,
5501; EG-NEXT:     MOV * T3.X, literal.x,
5502; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5503; EG-NEXT:     MOV * T4.X, KC0[2].Y,
5504; EG-NEXT:    ALU clause starting at 25:
5505; EG-NEXT:     MOV T0.X, 0.0,
5506; EG-NEXT:     MOV * T2.X, 0.0,
5507; EG-NEXT:     MOV * T4.X, 0.0,
5508;
5509; CM-LABEL: packed_struct_argument_alignment:
5510; CM:       ; %bb.0:
5511; CM-NEXT:    ALU 6, @18, KC0[CB0:0-32], KC1[]
5512; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5513; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5514; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5515; CM-NEXT:    ALU 2, @25, KC0[], KC1[]
5516; CM-NEXT:    TEX 0 @12
5517; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5518; CM-NEXT:    TEX 0 @14
5519; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5520; CM-NEXT:    TEX 0 @16
5521; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5522; CM-NEXT:    CF_END
5523; CM-NEXT:    Fetch clause starting at 12:
5524; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 49, #3
5525; CM-NEXT:    Fetch clause starting at 14:
5526; CM-NEXT:     VTX_READ_32 T2.X, T2.X, 57, #3
5527; CM-NEXT:    Fetch clause starting at 16:
5528; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 53, #3
5529; CM-NEXT:    ALU clause starting at 18:
5530; CM-NEXT:     MOV * T0.X, KC0[2].Z,
5531; CM-NEXT:     MOV * T1.X, literal.x,
5532; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5533; CM-NEXT:     MOV * T2.X, KC0[2].W,
5534; CM-NEXT:     MOV * T3.X, literal.x,
5535; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5536; CM-NEXT:     MOV * T4.X, KC0[2].Y,
5537; CM-NEXT:    ALU clause starting at 25:
5538; CM-NEXT:     MOV * T0.X, 0.0,
5539; CM-NEXT:     MOV * T2.X, 0.0,
5540; CM-NEXT:     MOV * T4.X, 0.0,
5541  %val0 = extractvalue <{i32, i64}> %arg0, 0
5542  %val1 = extractvalue <{i32, i64}> %arg0, 1
5543  %val2 = extractvalue <{i32, i64}> %arg1, 0
5544  %val3 = extractvalue <{i32, i64}> %arg1, 1
5545  store volatile i32 %val0, i32 addrspace(1)* null
5546  store volatile i64 %val1, i64 addrspace(1)* null
5547  store volatile i32 %val2, i32 addrspace(1)* null
5548  store volatile i64 %val3, i64 addrspace(1)* null
5549  ret void
5550}
5551
; Kernel-argument layout test. Two non-packed {i32, i64} structs, each followed
; by an unnamed i8, and a trailing <4 x i32>. In the GFX9 checks the s_load
; offsets from s[4:5] are 0x0 and 0x8 for the fields of %arg0, 0x18 and 0x20
; for the fields of %arg2, and 0x30 for %arg4.
5552define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
5553; SI-LABEL: struct_argument_alignment_after:
5554; SI:       ; %bb.0:
5555; SI-NEXT:    s_load_dword s12, s[0:1], 0x9
5556; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
5557; SI-NEXT:    s_load_dword s13, s[0:1], 0xf
5558; SI-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x11
5559; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
5560; SI-NEXT:    s_mov_b32 s4, 0
5561; SI-NEXT:    s_mov_b32 s7, 0xf000
5562; SI-NEXT:    s_mov_b32 s6, -1
5563; SI-NEXT:    s_mov_b32 s5, s4
5564; SI-NEXT:    s_waitcnt lgkmcnt(0)
5565; SI-NEXT:    v_mov_b32_e32 v0, s12
5566; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5567; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5568; SI-NEXT:    v_mov_b32_e32 v0, s8
5569; SI-NEXT:    v_mov_b32_e32 v1, s9
5570; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5571; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5572; SI-NEXT:    v_mov_b32_e32 v0, s13
5573; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5574; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5575; SI-NEXT:    v_mov_b32_e32 v0, s10
5576; SI-NEXT:    v_mov_b32_e32 v1, s11
5577; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5578; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5579; SI-NEXT:    v_mov_b32_e32 v0, s0
5580; SI-NEXT:    v_mov_b32_e32 v1, s1
5581; SI-NEXT:    v_mov_b32_e32 v2, s2
5582; SI-NEXT:    v_mov_b32_e32 v3, s3
5583; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5584; SI-NEXT:    s_waitcnt vmcnt(0)
5585; SI-NEXT:    s_endpgm
5586;
5587; VI-LABEL: struct_argument_alignment_after:
5588; VI:       ; %bb.0:
5589; VI-NEXT:    s_load_dword s8, s[0:1], 0x24
5590; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5591; VI-NEXT:    s_load_dword s9, s[0:1], 0x3c
5592; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x44
5593; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
5594; VI-NEXT:    v_mov_b32_e32 v4, 0
5595; VI-NEXT:    v_mov_b32_e32 v5, 0
5596; VI-NEXT:    s_waitcnt lgkmcnt(0)
5597; VI-NEXT:    v_mov_b32_e32 v0, s8
5598; VI-NEXT:    flat_store_dword v[4:5], v0
5599; VI-NEXT:    s_waitcnt vmcnt(0)
5600; VI-NEXT:    v_mov_b32_e32 v0, s4
5601; VI-NEXT:    v_mov_b32_e32 v1, s5
5602; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
5603; VI-NEXT:    s_waitcnt vmcnt(0)
5604; VI-NEXT:    v_mov_b32_e32 v0, s9
5605; VI-NEXT:    flat_store_dword v[4:5], v0
5606; VI-NEXT:    s_waitcnt vmcnt(0)
5607; VI-NEXT:    v_mov_b32_e32 v0, s6
5608; VI-NEXT:    v_mov_b32_e32 v1, s7
5609; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
5610; VI-NEXT:    s_waitcnt vmcnt(0)
5611; VI-NEXT:    v_mov_b32_e32 v0, s0
5612; VI-NEXT:    v_mov_b32_e32 v1, s1
5613; VI-NEXT:    v_mov_b32_e32 v2, s2
5614; VI-NEXT:    v_mov_b32_e32 v3, s3
5615; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
5616; VI-NEXT:    s_waitcnt vmcnt(0)
5617; VI-NEXT:    s_endpgm
5618;
5619; GFX9-LABEL: struct_argument_alignment_after:
5620; GFX9:       ; %bb.0:
5621; GFX9-NEXT:    s_load_dword s10, s[4:5], 0x0
5622; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
5623; GFX9-NEXT:    s_load_dword s11, s[4:5], 0x18
5624; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
5625; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x30
5626; GFX9-NEXT:    v_mov_b32_e32 v4, 0
5627; GFX9-NEXT:    v_mov_b32_e32 v5, 0
5628; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5629; GFX9-NEXT:    v_mov_b32_e32 v0, s10
5630; GFX9-NEXT:    global_store_dword v[4:5], v0, off
5631; GFX9-NEXT:    s_waitcnt vmcnt(0)
5632; GFX9-NEXT:    v_mov_b32_e32 v0, s6
5633; GFX9-NEXT:    v_mov_b32_e32 v1, s7
5634; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
5635; GFX9-NEXT:    s_waitcnt vmcnt(0)
5636; GFX9-NEXT:    v_mov_b32_e32 v0, s11
5637; GFX9-NEXT:    global_store_dword v[4:5], v0, off
5638; GFX9-NEXT:    s_waitcnt vmcnt(0)
5639; GFX9-NEXT:    v_mov_b32_e32 v0, s8
5640; GFX9-NEXT:    v_mov_b32_e32 v1, s9
5641; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
5642; GFX9-NEXT:    s_waitcnt vmcnt(0)
5643; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5644; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5645; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5646; GFX9-NEXT:    v_mov_b32_e32 v3, s3
5647; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
5648; GFX9-NEXT:    s_waitcnt vmcnt(0)
5649; GFX9-NEXT:    s_endpgm
5650;
5651; EG-LABEL: struct_argument_alignment_after:
5652; EG:       ; %bb.0:
5653; EG-NEXT:    ALU 13, @10, KC0[CB0:0-32], KC1[]
5654; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0
5655; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0
5656; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
5657; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0
5658; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0
5659; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0
5660; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1
5661; EG-NEXT:    CF_END
5662; EG-NEXT:    PAD
5663; EG-NEXT:    ALU clause starting at 10:
5664; EG-NEXT:     MOV * T0.W, KC0[6].X,
5665; EG-NEXT:     MOV * T0.Z, KC0[5].W,
5666; EG-NEXT:     MOV * T0.Y, KC0[5].Z,
5667; EG-NEXT:     MOV T0.X, KC0[5].Y,
5668; EG-NEXT:     MOV * T1.X, KC0[4].Y,
5669; EG-NEXT:     MOV T2.X, KC0[4].Z,
5670; EG-NEXT:     MOV * T3.X, KC0[3].W,
5671; EG-NEXT:     MOV T4.X, KC0[2].W,
5672; EG-NEXT:     MOV * T5.X, literal.x,
5673; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5674; EG-NEXT:     MOV T6.X, KC0[3].X,
5675; EG-NEXT:     MOV * T7.X, literal.x,
5676; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5677; EG-NEXT:     MOV * T8.X, KC0[2].Y,
5678;
5679; CM-LABEL: struct_argument_alignment_after:
5680; CM:       ; %bb.0:
5681; CM-NEXT:    ALU 13, @10, KC0[CB0:0-32], KC1[]
5682; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X
5683; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X
5684; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X
5685; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X
5686; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X
5687; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X
5688; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T7.X
5689; CM-NEXT:    CF_END
5690; CM-NEXT:    PAD
5691; CM-NEXT:    ALU clause starting at 10:
5692; CM-NEXT:     MOV * T0.W, KC0[6].X,
5693; CM-NEXT:     MOV * T0.Z, KC0[5].W,
5694; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
5695; CM-NEXT:     MOV * T0.X, KC0[5].Y,
5696; CM-NEXT:     MOV * T1.X, KC0[4].Y,
5697; CM-NEXT:     MOV * T2.X, KC0[4].Z,
5698; CM-NEXT:     MOV * T3.X, KC0[3].W,
5699; CM-NEXT:     MOV * T4.X, KC0[2].W,
5700; CM-NEXT:     MOV * T5.X, literal.x,
5701; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5702; CM-NEXT:     MOV * T6.X, KC0[3].X,
5703; CM-NEXT:     MOV * T7.X, literal.x,
5704; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5705; CM-NEXT:     MOV * T8.X, KC0[2].Y,
; Each struct field and the vector gets its own volatile store to a null
; addrspace(1) pointer, in argument order.
5706  %val0 = extractvalue {i32, i64} %arg0, 0
5707  %val1 = extractvalue {i32, i64} %arg0, 1
5708  %val2 = extractvalue {i32, i64} %arg2, 0
5709  %val3 = extractvalue {i32, i64} %arg2, 1
5710  store volatile i32 %val0, i32 addrspace(1)* null
5711  store volatile i64 %val1, i64 addrspace(1)* null
5712  store volatile i32 %val2, i32 addrspace(1)* null
5713  store volatile i64 %val3, i64 addrspace(1)* null
5714  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
5715  ret void
5716}
5717
; Kernel taking an i16 followed by a [3 x i32] array. In the GFX9 checks the
; i16 comes from a dword load at 0x0 (then stored as a short) and the three
; array elements come from separate dword loads at 0x4, 0x8 and 0xc.
5718define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
5719; SI-LABEL: array_3xi32:
5720; SI:       ; %bb.0:
5721; SI-NEXT:    s_load_dword s4, s[0:1], 0xc
5722; SI-NEXT:    s_load_dword s5, s[0:1], 0x9
5723; SI-NEXT:    s_load_dword s6, s[0:1], 0xa
5724; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
5725; SI-NEXT:    s_mov_b32 s3, 0xf000
5726; SI-NEXT:    s_mov_b32 s2, -1
5727; SI-NEXT:    s_waitcnt lgkmcnt(0)
5728; SI-NEXT:    v_mov_b32_e32 v0, s5
5729; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
5730; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5731; SI-NEXT:    v_mov_b32_e32 v0, s4
5732; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5733; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5734; SI-NEXT:    v_mov_b32_e32 v0, s0
5735; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5736; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5737; SI-NEXT:    v_mov_b32_e32 v0, s6
5738; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5739; SI-NEXT:    s_waitcnt vmcnt(0)
5740; SI-NEXT:    s_endpgm
5741;
5742; VI-LABEL: array_3xi32:
5743; VI:       ; %bb.0:
5744; VI-NEXT:    s_load_dword s2, s[0:1], 0x24
5745; VI-NEXT:    s_load_dword s3, s[0:1], 0x30
5746; VI-NEXT:    s_load_dword s4, s[0:1], 0x28
5747; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
5748; VI-NEXT:    s_waitcnt lgkmcnt(0)
5749; VI-NEXT:    v_mov_b32_e32 v0, s2
5750; VI-NEXT:    v_mov_b32_e32 v1, s3
5751; VI-NEXT:    flat_store_short v[0:1], v0
5752; VI-NEXT:    s_waitcnt vmcnt(0)
5753; VI-NEXT:    flat_store_dword v[0:1], v1
5754; VI-NEXT:    s_waitcnt vmcnt(0)
5755; VI-NEXT:    v_mov_b32_e32 v0, s0
5756; VI-NEXT:    flat_store_dword v[0:1], v0
5757; VI-NEXT:    s_waitcnt vmcnt(0)
5758; VI-NEXT:    v_mov_b32_e32 v0, s4
5759; VI-NEXT:    flat_store_dword v[0:1], v0
5760; VI-NEXT:    s_waitcnt vmcnt(0)
5761; VI-NEXT:    s_endpgm
5762;
5763; GFX9-LABEL: array_3xi32:
5764; GFX9:       ; %bb.0:
5765; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
5766; GFX9-NEXT:    s_load_dword s1, s[4:5], 0xc
5767; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x4
5768; GFX9-NEXT:    s_load_dword s3, s[4:5], 0x8
5769; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5770; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5771; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5772; GFX9-NEXT:    global_store_short v[0:1], v0, off
5773; GFX9-NEXT:    s_waitcnt vmcnt(0)
5774; GFX9-NEXT:    global_store_dword v[0:1], v1, off
5775; GFX9-NEXT:    s_waitcnt vmcnt(0)
5776; GFX9-NEXT:    v_mov_b32_e32 v0, s3
5777; GFX9-NEXT:    global_store_dword v[0:1], v0, off
5778; GFX9-NEXT:    s_waitcnt vmcnt(0)
5779; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5780; GFX9-NEXT:    global_store_dword v[0:1], v0, off
5781; GFX9-NEXT:    s_waitcnt vmcnt(0)
5782; GFX9-NEXT:    s_endpgm
5783;
5784; EG-LABEL: array_3xi32:
5785; EG:       ; %bb.0:
5786; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
5787; EG-NEXT:    TEX 0 @8
5788; EG-NEXT:    ALU 9, @11, KC0[CB0:0-32], KC1[]
5789; EG-NEXT:    MEM_RAT MSKOR T0.XW, T4.X
5790; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0
5791; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
5792; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1
5793; EG-NEXT:    CF_END
5794; EG-NEXT:    Fetch clause starting at 8:
5795; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 36, #3
5796; EG-NEXT:    ALU clause starting at 10:
5797; EG-NEXT:     MOV * T0.X, 0.0,
5798; EG-NEXT:    ALU clause starting at 11:
5799; EG-NEXT:     AND_INT T0.X, T0.X, literal.x,
5800; EG-NEXT:     MOV * T0.W, literal.x,
5801; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5802; EG-NEXT:     MOV T0.Y, 0.0,
5803; EG-NEXT:     MOV * T0.Z, 0.0,
5804; EG-NEXT:     MOV T1.X, KC0[2].Z,
5805; EG-NEXT:     MOV * T2.X, KC0[2].W,
5806; EG-NEXT:     MOV T3.X, KC0[3].X,
5807; EG-NEXT:     MOV * T4.X, literal.x,
5808; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5809;
5810; CM-LABEL: array_3xi32:
5811; CM:       ; %bb.0:
5812; CM-NEXT:    ALU 0, @10, KC0[], KC1[]
5813; CM-NEXT:    TEX 0 @8
5814; CM-NEXT:    ALU 9, @11, KC0[CB0:0-32], KC1[]
5815; CM-NEXT:    MEM_RAT MSKOR T0.XW, T4.X
5816; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X
5817; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X
5818; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5819; CM-NEXT:    CF_END
5820; CM-NEXT:    Fetch clause starting at 8:
5821; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 36, #3
5822; CM-NEXT:    ALU clause starting at 10:
5823; CM-NEXT:     MOV * T0.X, 0.0,
5824; CM-NEXT:    ALU clause starting at 11:
5825; CM-NEXT:     AND_INT T0.X, T0.X, literal.x,
5826; CM-NEXT:     MOV * T0.W, literal.x,
5827; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5828; CM-NEXT:     MOV T0.Y, 0.0,
5829; CM-NEXT:     MOV * T0.Z, 0.0,
5830; CM-NEXT:     MOV * T1.X, KC0[2].Z,
5831; CM-NEXT:     MOV * T2.X, KC0[2].W,
5832; CM-NEXT:     MOV * T3.X, KC0[3].X,
5833; CM-NEXT:     MOV * T4.X, literal.x,
5834; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; Both arguments are written with volatile stores through undef addrspace(1)
; pointers; the array is stored as a single aggregate value.
5835  store volatile i16 %arg0, i16 addrspace(1)* undef
5836  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
5837  ret void
5838}
5839
5840; FIXME: Why not all scalar loads?
; Kernel taking an i8 followed by a [3 x i16] array. In the SI, VI and GFX9
; checks only the i8 is read with a scalar load; the three i16 elements are
; fetched with ushort vector-memory loads (offsets 2, 4 and 6 from s[4:5] in
; the GFX9 checks).
5841define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
5842; SI-LABEL: array_3xi16:
5843; SI:       ; %bb.0:
5844; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
5845; SI-NEXT:    s_mov_b32 s3, 0xf000
5846; SI-NEXT:    s_mov_b32 s2, -1
5847; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:42
5848; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:40
5849; SI-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:38
5850; SI-NEXT:    s_waitcnt lgkmcnt(0)
5851; SI-NEXT:    v_mov_b32_e32 v3, s4
5852; SI-NEXT:    buffer_store_byte v3, off, s[0:3], 0
5853; SI-NEXT:    s_waitcnt vmcnt(0)
5854; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
5855; SI-NEXT:    s_waitcnt vmcnt(0)
5856; SI-NEXT:    buffer_store_short v1, off, s[0:3], 0
5857; SI-NEXT:    s_waitcnt vmcnt(0)
5858; SI-NEXT:    buffer_store_short v2, off, s[0:3], 0
5859; SI-NEXT:    s_waitcnt vmcnt(0)
5860; SI-NEXT:    s_endpgm
5861;
5862; VI-LABEL: array_3xi16:
5863; VI:       ; %bb.0:
5864; VI-NEXT:    s_add_u32 s2, s0, 38
5865; VI-NEXT:    s_addc_u32 s3, s1, 0
5866; VI-NEXT:    s_add_u32 s4, s2, 2
5867; VI-NEXT:    s_addc_u32 s5, s3, 0
5868; VI-NEXT:    v_mov_b32_e32 v0, s2
5869; VI-NEXT:    v_mov_b32_e32 v1, s3
5870; VI-NEXT:    s_add_u32 s2, s0, 42
5871; VI-NEXT:    s_addc_u32 s3, s1, 0
5872; VI-NEXT:    v_mov_b32_e32 v2, s2
5873; VI-NEXT:    v_mov_b32_e32 v3, s3
5874; VI-NEXT:    flat_load_ushort v4, v[0:1]
5875; VI-NEXT:    flat_load_ushort v2, v[2:3]
5876; VI-NEXT:    v_mov_b32_e32 v0, s4
5877; VI-NEXT:    v_mov_b32_e32 v1, s5
5878; VI-NEXT:    flat_load_ushort v0, v[0:1]
5879; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
5880; VI-NEXT:    s_waitcnt lgkmcnt(0)
5881; VI-NEXT:    v_mov_b32_e32 v1, s0
5882; VI-NEXT:    s_waitcnt vmcnt(0)
5883; VI-NEXT:    flat_store_byte v[0:1], v1
5884; VI-NEXT:    s_waitcnt vmcnt(0)
5885; VI-NEXT:    flat_store_short v[0:1], v2
5886; VI-NEXT:    s_waitcnt vmcnt(0)
5887; VI-NEXT:    flat_store_short v[0:1], v4
5888; VI-NEXT:    s_waitcnt vmcnt(0)
5889; VI-NEXT:    flat_store_short v[0:1], v0
5890; VI-NEXT:    s_waitcnt vmcnt(0)
5891; VI-NEXT:    s_endpgm
5892;
5893; GFX9-LABEL: array_3xi16:
5894; GFX9:       ; %bb.0:
5895; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5896; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:6
5897; GFX9-NEXT:    global_load_ushort v2, v0, s[4:5] offset:4
5898; GFX9-NEXT:    global_load_ushort v3, v0, s[4:5] offset:2
5899; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
5900; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5901; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5902; GFX9-NEXT:    s_waitcnt vmcnt(2)
5903; GFX9-NEXT:    global_store_byte v[0:1], v0, off
5904; GFX9-NEXT:    s_waitcnt vmcnt(0)
5905; GFX9-NEXT:    global_store_short v[0:1], v1, off
5906; GFX9-NEXT:    s_waitcnt vmcnt(0)
5907; GFX9-NEXT:    global_store_short v[0:1], v2, off
5908; GFX9-NEXT:    s_waitcnt vmcnt(0)
5909; GFX9-NEXT:    global_store_short v[0:1], v3, off
5910; GFX9-NEXT:    s_waitcnt vmcnt(0)
5911; GFX9-NEXT:    s_endpgm
5912;
5913; EG-LABEL: array_3xi16:
5914; EG:       ; %bb.0:
5915; EG-NEXT:    ALU 0, @20, KC0[], KC1[]
5916; EG-NEXT:    TEX 1 @12
5917; EG-NEXT:    ALU 11, @21, KC0[], KC1[]
5918; EG-NEXT:    MEM_RAT MSKOR T1.XW, T3.X
5919; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5920; EG-NEXT:    TEX 0 @16
5921; EG-NEXT:    ALU 3, @33, KC0[], KC1[]
5922; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5923; EG-NEXT:    TEX 0 @18
5924; EG-NEXT:    ALU 3, @37, KC0[], KC1[]
5925; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5926; EG-NEXT:    CF_END
5927; EG-NEXT:    Fetch clause starting at 12:
5928; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 36, #3
5929; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 42, #3
5930; EG-NEXT:    Fetch clause starting at 16:
5931; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
5932; EG-NEXT:    Fetch clause starting at 18:
5933; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 38, #3
5934; EG-NEXT:    ALU clause starting at 20:
5935; EG-NEXT:     MOV * T0.X, 0.0,
5936; EG-NEXT:    ALU clause starting at 21:
5937; EG-NEXT:     AND_INT T1.X, T1.X, literal.x,
5938; EG-NEXT:     MOV * T1.W, literal.x,
5939; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
5940; EG-NEXT:     MOV * T1.Y, 0.0,
5941; EG-NEXT:     AND_INT T2.X, T2.X, literal.x,
5942; EG-NEXT:     MOV * T2.W, literal.x,
5943; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5944; EG-NEXT:     MOV T2.Y, 0.0,
5945; EG-NEXT:     MOV T1.Z, 0.0,
5946; EG-NEXT:     MOV * T2.Z, 0.0,
5947; EG-NEXT:     MOV * T3.X, literal.x,
5948; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5949; EG-NEXT:    ALU clause starting at 33:
5950; EG-NEXT:     AND_INT T2.X, T1.X, literal.x,
5951; EG-NEXT:     MOV T2.Y, 0.0,
5952; EG-NEXT:     MOV * T2.Z, 0.0,
5953; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5954; EG-NEXT:    ALU clause starting at 37:
5955; EG-NEXT:     AND_INT T2.X, T0.X, literal.x,
5956; EG-NEXT:     MOV T2.Y, 0.0,
5957; EG-NEXT:     MOV * T2.Z, 0.0,
5958; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5959;
5960; CM-LABEL: array_3xi16:
5961; CM:       ; %bb.0:
5962; CM-NEXT:    ALU 0, @20, KC0[], KC1[]
5963; CM-NEXT:    TEX 1 @12
5964; CM-NEXT:    ALU 11, @21, KC0[], KC1[]
5965; CM-NEXT:    MEM_RAT MSKOR T1.XW, T3.X
5966; CM-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5967; CM-NEXT:    TEX 0 @16
5968; CM-NEXT:    ALU 3, @33, KC0[], KC1[]
5969; CM-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5970; CM-NEXT:    TEX 0 @18
5971; CM-NEXT:    ALU 3, @37, KC0[], KC1[]
5972; CM-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5973; CM-NEXT:    CF_END
5974; CM-NEXT:    Fetch clause starting at 12:
5975; CM-NEXT:     VTX_READ_8 T1.X, T0.X, 36, #3
5976; CM-NEXT:     VTX_READ_16 T2.X, T0.X, 42, #3
5977; CM-NEXT:    Fetch clause starting at 16:
5978; CM-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
5979; CM-NEXT:    Fetch clause starting at 18:
5980; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 38, #3
5981; CM-NEXT:    ALU clause starting at 20:
5982; CM-NEXT:     MOV * T0.X, 0.0,
5983; CM-NEXT:    ALU clause starting at 21:
5984; CM-NEXT:     AND_INT T1.X, T1.X, literal.x,
5985; CM-NEXT:     MOV * T1.W, literal.x,
5986; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
5987; CM-NEXT:     MOV * T1.Y, 0.0,
5988; CM-NEXT:     AND_INT T2.X, T2.X, literal.x,
5989; CM-NEXT:     MOV * T2.W, literal.x,
5990; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5991; CM-NEXT:     MOV T2.Y, 0.0,
5992; CM-NEXT:     MOV * T1.Z, 0.0,
5993; CM-NEXT:     MOV * T2.Z, 0.0,
5994; CM-NEXT:     MOV * T3.X, literal.x,
5995; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5996; CM-NEXT:    ALU clause starting at 33:
5997; CM-NEXT:     AND_INT T2.X, T1.X, literal.x,
5998; CM-NEXT:     MOV T2.Y, 0.0,
5999; CM-NEXT:     MOV * T2.Z, 0.0,
6000; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
6001; CM-NEXT:    ALU clause starting at 37:
6002; CM-NEXT:     AND_INT T2.X, T0.X, literal.x,
6003; CM-NEXT:     MOV T2.Y, 0.0,
6004; CM-NEXT:     MOV * T2.Z, 0.0,
6005; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; Both arguments are written with volatile stores through undef addrspace(1)
; pointers; the array is stored as a single aggregate value.
6006  store volatile i8 %arg0, i8 addrspace(1)* undef
6007  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
6008  ret void
6009}
6010
; Kernel taking an unnamed i8 followed by a [1 x i8] array. The checks expect
; the array element to be read with a single byte load at offset 1 from s[4:5]
; on GFX9, and at offset 37 in the SI, VI and R600 (EGCM) checks.
6011define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
6012; SI-LABEL: small_array_round_down_offset:
6013; SI:       ; %bb.0:
6014; SI-NEXT:    s_mov_b32 s3, 0xf000
6015; SI-NEXT:    s_mov_b32 s2, -1
6016; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:37
6017; SI-NEXT:    s_waitcnt vmcnt(0)
6018; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
6019; SI-NEXT:    s_waitcnt vmcnt(0)
6020; SI-NEXT:    s_endpgm
6021;
6022; VI-LABEL: small_array_round_down_offset:
6023; VI:       ; %bb.0:
6024; VI-NEXT:    s_add_u32 s0, s0, 37
6025; VI-NEXT:    s_addc_u32 s1, s1, 0
6026; VI-NEXT:    v_mov_b32_e32 v0, s0
6027; VI-NEXT:    v_mov_b32_e32 v1, s1
6028; VI-NEXT:    flat_load_ubyte v0, v[0:1]
6029; VI-NEXT:    s_waitcnt vmcnt(0)
6030; VI-NEXT:    flat_store_byte v[0:1], v0
6031; VI-NEXT:    s_waitcnt vmcnt(0)
6032; VI-NEXT:    s_endpgm
6033;
6034; GFX9-LABEL: small_array_round_down_offset:
6035; GFX9:       ; %bb.0:
6036; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6037; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5] offset:1
6038; GFX9-NEXT:    s_waitcnt vmcnt(0)
6039; GFX9-NEXT:    global_store_byte v[0:1], v0, off
6040; GFX9-NEXT:    s_waitcnt vmcnt(0)
6041; GFX9-NEXT:    s_endpgm
6042;
6043; EGCM-LABEL: small_array_round_down_offset:
6044; EGCM:       ; %bb.0:
6045; EGCM-NEXT:    ALU 0, @8, KC0[], KC1[]
6046; EGCM-NEXT:    TEX 0 @6
6047; EGCM-NEXT:    ALU 6, @9, KC0[], KC1[]
6048; EGCM-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
6049; EGCM-NEXT:    CF_END
6050; EGCM-NEXT:    PAD
6051; EGCM-NEXT:    Fetch clause starting at 6:
6052; EGCM-NEXT:     VTX_READ_8 T0.X, T0.X, 37, #3
6053; EGCM-NEXT:    ALU clause starting at 8:
6054; EGCM-NEXT:     MOV * T0.X, 0.0,
6055; EGCM-NEXT:    ALU clause starting at 9:
6056; EGCM-NEXT:     AND_INT T0.X, T0.X, literal.x,
6057; EGCM-NEXT:     MOV * T0.W, literal.x,
6058; EGCM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
6059; EGCM-NEXT:     MOV T0.Y, 0.0,
6060; EGCM-NEXT:     MOV * T0.Z, 0.0,
6061; EGCM-NEXT:     MOV * T1.X, literal.x,
6062; EGCM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; Only the array element is used; the leading i8 argument is never read.
6063  %val = extractvalue [1 x i8] %arg, 0
6064  store volatile i8 %val, i8 addrspace(1)* undef
6065  ret void
6066}
6067
; Kernel with a byref(i32) argument in addrspace(4) carrying align(256),
; placed between an output pointer and a trailing i32. In the GFX9 checks %out
; is loaded from offset 0x0 and a single dwordx2 load at 0x100 supplies both
; the byref value and %after.offset.
6068define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
6069; SI-LABEL: byref_align_constant_i32_arg:
6070; SI:       ; %bb.0:
6071; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x49
6072; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6073; SI-NEXT:    s_mov_b32 s3, 0xf000
6074; SI-NEXT:    s_mov_b32 s2, -1
6075; SI-NEXT:    s_waitcnt lgkmcnt(0)
6076; SI-NEXT:    v_mov_b32_e32 v0, s4
6077; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6078; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6079; SI-NEXT:    v_mov_b32_e32 v0, s5
6080; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6081; SI-NEXT:    s_waitcnt vmcnt(0)
6082; SI-NEXT:    s_endpgm
6083;
6084; VI-LABEL: byref_align_constant_i32_arg:
6085; VI:       ; %bb.0:
6086; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6087; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x124
6088; VI-NEXT:    s_waitcnt lgkmcnt(0)
6089; VI-NEXT:    v_mov_b32_e32 v0, s2
6090; VI-NEXT:    v_mov_b32_e32 v1, s3
6091; VI-NEXT:    v_mov_b32_e32 v2, s0
6092; VI-NEXT:    v_mov_b32_e32 v3, s1
6093; VI-NEXT:    flat_store_dword v[0:1], v2
6094; VI-NEXT:    s_waitcnt vmcnt(0)
6095; VI-NEXT:    flat_store_dword v[0:1], v3
6096; VI-NEXT:    s_waitcnt vmcnt(0)
6097; VI-NEXT:    s_endpgm
6098;
6099; GFX9-LABEL: byref_align_constant_i32_arg:
6100; GFX9:       ; %bb.0:
6101; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x100
6102; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
6103; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6104; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6105; GFX9-NEXT:    v_mov_b32_e32 v1, s0
6106; GFX9-NEXT:    v_mov_b32_e32 v2, s1
6107; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
6108; GFX9-NEXT:    s_waitcnt vmcnt(0)
6109; GFX9-NEXT:    global_store_dword v0, v2, s[2:3]
6110; GFX9-NEXT:    s_waitcnt vmcnt(0)
6111; GFX9-NEXT:    s_endpgm
6112;
6113; EG-LABEL: byref_align_constant_i32_arg:
6114; EG:       ; %bb.0:
6115; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
6116; EG-NEXT:    TEX 0 @6
6117; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
6118; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0
6119; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
6120; EG-NEXT:    CF_END
6121; EG-NEXT:    Fetch clause starting at 6:
6122; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
6123; EG-NEXT:    ALU clause starting at 8:
6124; EG-NEXT:     MOV * T0.X, KC0[18].Y,
6125; EG-NEXT:    ALU clause starting at 9:
6126; EG-NEXT:     MOV T1.X, KC0[18].Z,
6127; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6128; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6129;
6130; CM-LABEL: byref_align_constant_i32_arg:
6131; CM:       ; %bb.0:
6132; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
6133; CM-NEXT:    TEX 0 @6
6134; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
6135; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
6136; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
6137; CM-NEXT:    CF_END
6138; CM-NEXT:    Fetch clause starting at 6:
6139; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
6140; CM-NEXT:    ALU clause starting at 8:
6141; CM-NEXT:     MOV * T0.X, KC0[18].Y,
6142; CM-NEXT:    ALU clause starting at 9:
6143; CM-NEXT:     MOV * T1.X, KC0[18].Z,
6144; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6145; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Load the i32 through the byref pointer, then write it and the trailing
; argument to %out with two volatile stores.
6146  %in = load i32, i32 addrspace(4)* %in.byref
6147  store volatile i32 %in, i32 addrspace(1)* %out, align 4
6148  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
6149  ret void
6150}
6151
; Kernel-argument lowering test for a byref <16 x i32> argument that lives in
; address space 4 and carries no explicit align attribute.
;
; Arguments, in order: the output pointer %out, an unnamed i8, the byref
; vector pointer %in.byref, and a trailing i32 %after.offset. The i8 and the
; trailing i32 presumably exist to expose how the byref argument is placed
; relative to its neighbours (TODO confirm against the commit that added it).
;
; Offsets visible in the checks below, in the units each target prints:
;   - SI run:   %out at 0x9,  vector at 0x19, %after.offset at 0x29
;   - VI run:   %out at 0x24, vector at 0x64, %after.offset at 0xa4
;   - GFX9 run: %out at 0x0,  vector at 0x40, %after.offset at 0x80
;   - R600 runs (Evergreen and Cayman): %out from KC0[2].Y, vector fetched with
;     VTX_READ_128 from the base in KC0[6].Y, %after.offset from KC0[10].Y
;
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
6152define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
6153; SI-LABEL: byref_natural_align_constant_v16i32_arg:
6154; SI:       ; %bb.0:
6155; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
6156; SI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
6157; SI-NEXT:    s_load_dword s0, s[0:1], 0x29
6158; SI-NEXT:    s_mov_b32 s23, 0xf000
6159; SI-NEXT:    s_mov_b32 s22, -1
6160; SI-NEXT:    s_waitcnt lgkmcnt(0)
6161; SI-NEXT:    v_mov_b32_e32 v0, s16
6162; SI-NEXT:    v_mov_b32_e32 v1, s17
6163; SI-NEXT:    v_mov_b32_e32 v2, s18
6164; SI-NEXT:    v_mov_b32_e32 v3, s19
6165; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48
6166; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6167; SI-NEXT:    v_mov_b32_e32 v0, s12
6168; SI-NEXT:    v_mov_b32_e32 v1, s13
6169; SI-NEXT:    v_mov_b32_e32 v2, s14
6170; SI-NEXT:    v_mov_b32_e32 v3, s15
6171; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32
6172; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6173; SI-NEXT:    v_mov_b32_e32 v0, s8
6174; SI-NEXT:    v_mov_b32_e32 v1, s9
6175; SI-NEXT:    v_mov_b32_e32 v2, s10
6176; SI-NEXT:    v_mov_b32_e32 v3, s11
6177; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16
6178; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6179; SI-NEXT:    v_mov_b32_e32 v0, s4
6180; SI-NEXT:    v_mov_b32_e32 v1, s5
6181; SI-NEXT:    v_mov_b32_e32 v2, s6
6182; SI-NEXT:    v_mov_b32_e32 v3, s7
6183; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
6184; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6185; SI-NEXT:    v_mov_b32_e32 v0, s0
6186; SI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
6187; SI-NEXT:    s_waitcnt vmcnt(0)
6188; SI-NEXT:    s_endpgm
6189;
6190; VI-LABEL: byref_natural_align_constant_v16i32_arg:
6191; VI:       ; %bb.0:
6192; VI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x64
6193; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6194; VI-NEXT:    s_load_dword s20, s[0:1], 0xa4
6195; VI-NEXT:    s_waitcnt lgkmcnt(0)
6196; VI-NEXT:    v_mov_b32_e32 v0, s16
6197; VI-NEXT:    s_add_u32 s0, s2, 48
6198; VI-NEXT:    s_addc_u32 s1, s3, 0
6199; VI-NEXT:    v_mov_b32_e32 v5, s1
6200; VI-NEXT:    v_mov_b32_e32 v4, s0
6201; VI-NEXT:    s_add_u32 s0, s2, 32
6202; VI-NEXT:    v_mov_b32_e32 v1, s17
6203; VI-NEXT:    v_mov_b32_e32 v2, s18
6204; VI-NEXT:    v_mov_b32_e32 v3, s19
6205; VI-NEXT:    s_addc_u32 s1, s3, 0
6206; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6207; VI-NEXT:    s_waitcnt vmcnt(0)
6208; VI-NEXT:    v_mov_b32_e32 v5, s1
6209; VI-NEXT:    v_mov_b32_e32 v4, s0
6210; VI-NEXT:    s_add_u32 s0, s2, 16
6211; VI-NEXT:    v_mov_b32_e32 v0, s12
6212; VI-NEXT:    v_mov_b32_e32 v1, s13
6213; VI-NEXT:    v_mov_b32_e32 v2, s14
6214; VI-NEXT:    v_mov_b32_e32 v3, s15
6215; VI-NEXT:    s_addc_u32 s1, s3, 0
6216; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6217; VI-NEXT:    s_waitcnt vmcnt(0)
6218; VI-NEXT:    v_mov_b32_e32 v5, s1
6219; VI-NEXT:    v_mov_b32_e32 v0, s8
6220; VI-NEXT:    v_mov_b32_e32 v1, s9
6221; VI-NEXT:    v_mov_b32_e32 v2, s10
6222; VI-NEXT:    v_mov_b32_e32 v3, s11
6223; VI-NEXT:    v_mov_b32_e32 v4, s0
6224; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6225; VI-NEXT:    s_waitcnt vmcnt(0)
6226; VI-NEXT:    v_mov_b32_e32 v5, s3
6227; VI-NEXT:    v_mov_b32_e32 v0, s4
6228; VI-NEXT:    v_mov_b32_e32 v1, s5
6229; VI-NEXT:    v_mov_b32_e32 v2, s6
6230; VI-NEXT:    v_mov_b32_e32 v3, s7
6231; VI-NEXT:    v_mov_b32_e32 v4, s2
6232; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6233; VI-NEXT:    s_waitcnt vmcnt(0)
6234; VI-NEXT:    v_mov_b32_e32 v0, s20
6235; VI-NEXT:    flat_store_dword v[4:5], v0
6236; VI-NEXT:    s_waitcnt vmcnt(0)
6237; VI-NEXT:    s_endpgm
6238;
6239; GFX9-LABEL: byref_natural_align_constant_v16i32_arg:
6240; GFX9:       ; %bb.0:
6241; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
6242; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6243; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x80
6244; GFX9-NEXT:    v_mov_b32_e32 v4, 0
6245; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6246; GFX9-NEXT:    v_mov_b32_e32 v0, s20
6247; GFX9-NEXT:    v_mov_b32_e32 v1, s21
6248; GFX9-NEXT:    v_mov_b32_e32 v2, s22
6249; GFX9-NEXT:    v_mov_b32_e32 v3, s23
6250; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
6251; GFX9-NEXT:    s_waitcnt vmcnt(0)
6252; GFX9-NEXT:    v_mov_b32_e32 v0, s16
6253; GFX9-NEXT:    v_mov_b32_e32 v1, s17
6254; GFX9-NEXT:    v_mov_b32_e32 v2, s18
6255; GFX9-NEXT:    v_mov_b32_e32 v3, s19
6256; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
6257; GFX9-NEXT:    s_waitcnt vmcnt(0)
6258; GFX9-NEXT:    v_mov_b32_e32 v0, s12
6259; GFX9-NEXT:    v_mov_b32_e32 v1, s13
6260; GFX9-NEXT:    v_mov_b32_e32 v2, s14
6261; GFX9-NEXT:    v_mov_b32_e32 v3, s15
6262; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
6263; GFX9-NEXT:    s_waitcnt vmcnt(0)
6264; GFX9-NEXT:    v_mov_b32_e32 v0, s8
6265; GFX9-NEXT:    v_mov_b32_e32 v1, s9
6266; GFX9-NEXT:    v_mov_b32_e32 v2, s10
6267; GFX9-NEXT:    v_mov_b32_e32 v3, s11
6268; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
6269; GFX9-NEXT:    s_waitcnt vmcnt(0)
6270; GFX9-NEXT:    v_mov_b32_e32 v0, s2
6271; GFX9-NEXT:    global_store_dword v4, v0, s[0:1]
6272; GFX9-NEXT:    s_waitcnt vmcnt(0)
6273; GFX9-NEXT:    s_endpgm
6274;
6275; EG-LABEL: byref_natural_align_constant_v16i32_arg:
6276; EG:       ; %bb.0:
6277; EG-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
6278; EG-NEXT:    TEX 0 @16
6279; EG-NEXT:    ALU 3, @25, KC0[CB0:0-32], KC1[]
6280; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
6281; EG-NEXT:    ALU 3, @29, KC0[CB0:0-32], KC1[]
6282; EG-NEXT:    TEX 0 @18
6283; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6284; EG-NEXT:    ALU 3, @33, KC0[CB0:0-32], KC1[]
6285; EG-NEXT:    TEX 0 @20
6286; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6287; EG-NEXT:    ALU 2, @37, KC0[CB0:0-32], KC1[]
6288; EG-NEXT:    TEX 0 @22
6289; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
6290; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
6291; EG-NEXT:    CF_END
6292; EG-NEXT:    PAD
6293; EG-NEXT:    Fetch clause starting at 16:
6294; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
6295; EG-NEXT:    Fetch clause starting at 18:
6296; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
6297; EG-NEXT:    Fetch clause starting at 20:
6298; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
6299; EG-NEXT:    Fetch clause starting at 22:
6300; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
6301; EG-NEXT:    ALU clause starting at 24:
6302; EG-NEXT:     MOV * T0.X, KC0[6].Y,
6303; EG-NEXT:    ALU clause starting at 25:
6304; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6305; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
6306; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
6307; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6308; EG-NEXT:    ALU clause starting at 29:
6309; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6310; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
6311; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
6312; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6313; EG-NEXT:    ALU clause starting at 33:
6314; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6315; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6316; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
6317; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6318; EG-NEXT:    ALU clause starting at 37:
6319; EG-NEXT:     MOV T1.X, KC0[10].Y,
6320; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6321; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6322;
6323; CM-LABEL: byref_natural_align_constant_v16i32_arg:
6324; CM:       ; %bb.0:
6325; CM-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
6326; CM-NEXT:    TEX 0 @16
6327; CM-NEXT:    ALU 3, @25, KC0[CB0:0-32], KC1[]
6328; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
6329; CM-NEXT:    ALU 3, @29, KC0[CB0:0-32], KC1[]
6330; CM-NEXT:    TEX 0 @18
6331; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6332; CM-NEXT:    ALU 3, @33, KC0[CB0:0-32], KC1[]
6333; CM-NEXT:    TEX 0 @20
6334; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6335; CM-NEXT:    ALU 2, @37, KC0[CB0:0-32], KC1[]
6336; CM-NEXT:    TEX 0 @22
6337; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
6338; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
6339; CM-NEXT:    CF_END
6340; CM-NEXT:    PAD
6341; CM-NEXT:    Fetch clause starting at 16:
6342; CM-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
6343; CM-NEXT:    Fetch clause starting at 18:
6344; CM-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
6345; CM-NEXT:    Fetch clause starting at 20:
6346; CM-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
6347; CM-NEXT:    Fetch clause starting at 22:
6348; CM-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
6349; CM-NEXT:    ALU clause starting at 24:
6350; CM-NEXT:     MOV * T0.X, KC0[6].Y,
6351; CM-NEXT:    ALU clause starting at 25:
6352; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6353; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
6354; CM-NEXT:     LSHR * T2.X, PV.W, literal.x,
6355; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6356; CM-NEXT:    ALU clause starting at 29:
6357; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6358; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
6359; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
6360; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6361; CM-NEXT:    ALU clause starting at 33:
6362; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6363; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6364; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
6365; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6366; CM-NEXT:    ALU clause starting at 37:
6367; CM-NEXT:     MOV * T1.X, KC0[10].Y,
6368; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6369; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Load the full 16 x i32 value through the byref pointer.
6370  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
; Reinterpret %out as a vector pointer so the whole value can be stored to it.
6371  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
; Two volatile stores to the same base address: the vector first, then the
; trailing i32 argument. Each target's checks show both stores, in this order.
6372  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
6373  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
6374  ret void
6375}
6376