1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=SI %s
3; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=VI %s
4; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs | FileCheck -check-prefixes=GFX9 %s
5; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -check-prefixes=EGCM,EG %s
6; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -check-prefixes=EGCM,CM %s
7
; i8 kernel argument with no extension attribute: the body zero-extends %in
; to i32 and stores the result to %out.
; The amdgcn expectations load the argument as part of a full dword and mask
; it with 0xff; the r600 expectations fetch it with VTX_READ_8 at offset 40.
8define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
9; SI-LABEL: i8_arg:
10; SI:       ; %bb.0:
11; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
12; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
13; SI-NEXT:    s_mov_b32 s3, 0xf000
14; SI-NEXT:    s_waitcnt lgkmcnt(0)
15; SI-NEXT:    s_and_b32 s4, s2, 0xff
16; SI-NEXT:    s_mov_b32 s2, -1
17; SI-NEXT:    v_mov_b32_e32 v0, s4
18; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
19; SI-NEXT:    s_endpgm
20;
21; VI-LABEL: i8_arg:
22; VI:       ; %bb.0:
23; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
24; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
25; VI-NEXT:    s_waitcnt lgkmcnt(0)
26; VI-NEXT:    v_mov_b32_e32 v0, s2
27; VI-NEXT:    s_and_b32 s0, s0, 0xff
28; VI-NEXT:    v_mov_b32_e32 v1, s3
29; VI-NEXT:    v_mov_b32_e32 v2, s0
30; VI-NEXT:    flat_store_dword v[0:1], v2
31; VI-NEXT:    s_endpgm
32;
33; GFX9-LABEL: i8_arg:
34; GFX9:       ; %bb.0:
35; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
36; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
37; GFX9-NEXT:    v_mov_b32_e32 v0, 0
38; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
40; GFX9-NEXT:    v_mov_b32_e32 v1, s2
41; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
42; GFX9-NEXT:    s_endpgm
43;
44; EG-LABEL: i8_arg:
45; EG:       ; %bb.0:
46; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
47; EG-NEXT:    TEX 0 @6
48; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
49; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
50; EG-NEXT:    CF_END
51; EG-NEXT:    PAD
52; EG-NEXT:    Fetch clause starting at 6:
53; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
54; EG-NEXT:    ALU clause starting at 8:
55; EG-NEXT:     MOV * T0.X, 0.0,
56; EG-NEXT:    ALU clause starting at 9:
57; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
58; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
59;
60; CM-LABEL: i8_arg:
61; CM:       ; %bb.0:
62; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
63; CM-NEXT:    TEX 0 @6
64; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
65; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
66; CM-NEXT:    CF_END
67; CM-NEXT:    PAD
68; CM-NEXT:    Fetch clause starting at 6:
69; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
70; CM-NEXT:    ALU clause starting at 8:
71; CM-NEXT:     MOV * T0.X, 0.0,
72; CM-NEXT:    ALU clause starting at 9:
73; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
74; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
75  %ext = zext i8 %in to i32
76  store i32 %ext, i32 addrspace(1)* %out, align 4
77  ret void
78}
79
; i8 kernel argument marked zeroext: the body zero-extends %in to i32 and
; stores the result to %out.
; The amdgcn expectations are the same instruction sequences as for @i8_arg
; (dword load followed by a 0xff mask).
; NOTE(review): the r600 expectations (EG and CM prefixes) contain a BFE_INT
; with a literal of 8 and match the ones listed for @i8_sext_arg line for
; line, although the IR here is a zext -- confirm this is the intended output.
80define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
81; SI-LABEL: i8_zext_arg:
82; SI:       ; %bb.0:
83; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
84; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
85; SI-NEXT:    s_mov_b32 s3, 0xf000
86; SI-NEXT:    s_waitcnt lgkmcnt(0)
87; SI-NEXT:    s_and_b32 s4, s2, 0xff
88; SI-NEXT:    s_mov_b32 s2, -1
89; SI-NEXT:    v_mov_b32_e32 v0, s4
90; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
91; SI-NEXT:    s_endpgm
92;
93; VI-LABEL: i8_zext_arg:
94; VI:       ; %bb.0:
95; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
96; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
97; VI-NEXT:    s_waitcnt lgkmcnt(0)
98; VI-NEXT:    v_mov_b32_e32 v0, s2
99; VI-NEXT:    s_and_b32 s0, s0, 0xff
100; VI-NEXT:    v_mov_b32_e32 v1, s3
101; VI-NEXT:    v_mov_b32_e32 v2, s0
102; VI-NEXT:    flat_store_dword v[0:1], v2
103; VI-NEXT:    s_endpgm
104;
105; GFX9-LABEL: i8_zext_arg:
106; GFX9:       ; %bb.0:
107; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
108; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
109; GFX9-NEXT:    v_mov_b32_e32 v0, 0
110; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
112; GFX9-NEXT:    v_mov_b32_e32 v1, s2
113; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
114; GFX9-NEXT:    s_endpgm
115;
116; EG-LABEL: i8_zext_arg:
117; EG:       ; %bb.0:
118; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
119; EG-NEXT:    TEX 0 @6
120; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
121; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
122; EG-NEXT:    CF_END
123; EG-NEXT:    PAD
124; EG-NEXT:    Fetch clause starting at 6:
125; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
126; EG-NEXT:    ALU clause starting at 8:
127; EG-NEXT:     MOV * T0.X, 0.0,
128; EG-NEXT:    ALU clause starting at 9:
129; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
130; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
131; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
132;
133; CM-LABEL: i8_zext_arg:
134; CM:       ; %bb.0:
135; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
136; CM-NEXT:    TEX 0 @6
137; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
138; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
139; CM-NEXT:    CF_END
140; CM-NEXT:    PAD
141; CM-NEXT:    Fetch clause starting at 6:
142; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
143; CM-NEXT:    ALU clause starting at 8:
144; CM-NEXT:     MOV * T0.X, 0.0,
145; CM-NEXT:    ALU clause starting at 9:
146; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
147; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
148; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
149; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
150  %ext = zext i8 %in to i32
151  store i32 %ext, i32 addrspace(1)* %out, align 4
152  ret void
153}
154
; i8 kernel argument marked signext: the body sign-extends %in to i32 and
; stores the result to %out.
; The amdgcn expectations load the argument as part of a full dword and apply
; s_sext_i32_i8; the r600 expectations fetch it with VTX_READ_8 at offset 40
; and then apply BFE_INT with a literal of 8.
155define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
156; SI-LABEL: i8_sext_arg:
157; SI:       ; %bb.0:
158; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
159; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
160; SI-NEXT:    s_mov_b32 s3, 0xf000
161; SI-NEXT:    s_waitcnt lgkmcnt(0)
162; SI-NEXT:    s_sext_i32_i8 s4, s2
163; SI-NEXT:    s_mov_b32 s2, -1
164; SI-NEXT:    v_mov_b32_e32 v0, s4
165; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
166; SI-NEXT:    s_endpgm
167;
168; VI-LABEL: i8_sext_arg:
169; VI:       ; %bb.0:
170; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
171; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
172; VI-NEXT:    s_waitcnt lgkmcnt(0)
173; VI-NEXT:    v_mov_b32_e32 v0, s2
174; VI-NEXT:    s_sext_i32_i8 s0, s0
175; VI-NEXT:    v_mov_b32_e32 v1, s3
176; VI-NEXT:    v_mov_b32_e32 v2, s0
177; VI-NEXT:    flat_store_dword v[0:1], v2
178; VI-NEXT:    s_endpgm
179;
180; GFX9-LABEL: i8_sext_arg:
181; GFX9:       ; %bb.0:
182; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
183; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
184; GFX9-NEXT:    v_mov_b32_e32 v0, 0
185; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
186; GFX9-NEXT:    s_sext_i32_i8 s2, s2
187; GFX9-NEXT:    v_mov_b32_e32 v1, s2
188; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
189; GFX9-NEXT:    s_endpgm
190;
191; EG-LABEL: i8_sext_arg:
192; EG:       ; %bb.0:
193; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
194; EG-NEXT:    TEX 0 @6
195; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
196; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
197; EG-NEXT:    CF_END
198; EG-NEXT:    PAD
199; EG-NEXT:    Fetch clause starting at 6:
200; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
201; EG-NEXT:    ALU clause starting at 8:
202; EG-NEXT:     MOV * T0.X, 0.0,
203; EG-NEXT:    ALU clause starting at 9:
204; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
205; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
206; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
207;
208; CM-LABEL: i8_sext_arg:
209; CM:       ; %bb.0:
210; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
211; CM-NEXT:    TEX 0 @6
212; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
213; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
214; CM-NEXT:    CF_END
215; CM-NEXT:    PAD
216; CM-NEXT:    Fetch clause starting at 6:
217; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
218; CM-NEXT:    ALU clause starting at 8:
219; CM-NEXT:     MOV * T0.X, 0.0,
220; CM-NEXT:    ALU clause starting at 9:
221; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
222; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
223; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
224; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
225  %ext = sext i8 %in to i32
226  store i32 %ext, i32 addrspace(1)* %out, align 4
227  ret void
228}
229
; i16 kernel argument with no extension attribute: the body zero-extends %in
; to i32 and stores the result to %out.
; The amdgcn expectations load the argument as part of a full dword and mask
; it with 0xffff; the r600 expectations fetch it with VTX_READ_16 at offset 40.
230define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
231; SI-LABEL: i16_arg:
232; SI:       ; %bb.0:
233; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
234; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
235; SI-NEXT:    s_mov_b32 s3, 0xf000
236; SI-NEXT:    s_waitcnt lgkmcnt(0)
237; SI-NEXT:    s_and_b32 s4, s2, 0xffff
238; SI-NEXT:    s_mov_b32 s2, -1
239; SI-NEXT:    v_mov_b32_e32 v0, s4
240; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
241; SI-NEXT:    s_endpgm
242;
243; VI-LABEL: i16_arg:
244; VI:       ; %bb.0:
245; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
246; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
247; VI-NEXT:    s_waitcnt lgkmcnt(0)
248; VI-NEXT:    v_mov_b32_e32 v0, s2
249; VI-NEXT:    s_and_b32 s0, s0, 0xffff
250; VI-NEXT:    v_mov_b32_e32 v1, s3
251; VI-NEXT:    v_mov_b32_e32 v2, s0
252; VI-NEXT:    flat_store_dword v[0:1], v2
253; VI-NEXT:    s_endpgm
254;
255; GFX9-LABEL: i16_arg:
256; GFX9:       ; %bb.0:
257; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
258; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
259; GFX9-NEXT:    v_mov_b32_e32 v0, 0
260; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
261; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
262; GFX9-NEXT:    v_mov_b32_e32 v1, s2
263; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
264; GFX9-NEXT:    s_endpgm
265;
266; EG-LABEL: i16_arg:
267; EG:       ; %bb.0:
268; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
269; EG-NEXT:    TEX 0 @6
270; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
271; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
272; EG-NEXT:    CF_END
273; EG-NEXT:    PAD
274; EG-NEXT:    Fetch clause starting at 6:
275; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
276; EG-NEXT:    ALU clause starting at 8:
277; EG-NEXT:     MOV * T0.X, 0.0,
278; EG-NEXT:    ALU clause starting at 9:
279; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
280; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
281;
282; CM-LABEL: i16_arg:
283; CM:       ; %bb.0:
284; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
285; CM-NEXT:    TEX 0 @6
286; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
287; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
288; CM-NEXT:    CF_END
289; CM-NEXT:    PAD
290; CM-NEXT:    Fetch clause starting at 6:
291; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
292; CM-NEXT:    ALU clause starting at 8:
293; CM-NEXT:     MOV * T0.X, 0.0,
294; CM-NEXT:    ALU clause starting at 9:
295; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
296; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
297  %ext = zext i16 %in to i32
298  store i32 %ext, i32 addrspace(1)* %out, align 4
299  ret void
300}
301
; i16 kernel argument marked zeroext: the body zero-extends %in to i32 and
; stores the result to %out.
; The amdgcn expectations are the same instruction sequences as for @i16_arg
; (dword load followed by a 0xffff mask).
; NOTE(review): the r600 expectations (EG and CM prefixes) contain a BFE_INT
; with a literal of 16 and match the ones listed for @i16_sext_arg line for
; line, although the IR here is a zext -- confirm this is the intended output.
302define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
303; SI-LABEL: i16_zext_arg:
304; SI:       ; %bb.0:
305; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
306; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
307; SI-NEXT:    s_mov_b32 s3, 0xf000
308; SI-NEXT:    s_waitcnt lgkmcnt(0)
309; SI-NEXT:    s_and_b32 s4, s2, 0xffff
310; SI-NEXT:    s_mov_b32 s2, -1
311; SI-NEXT:    v_mov_b32_e32 v0, s4
312; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
313; SI-NEXT:    s_endpgm
314;
315; VI-LABEL: i16_zext_arg:
316; VI:       ; %bb.0:
317; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
318; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
319; VI-NEXT:    s_waitcnt lgkmcnt(0)
320; VI-NEXT:    v_mov_b32_e32 v0, s2
321; VI-NEXT:    s_and_b32 s0, s0, 0xffff
322; VI-NEXT:    v_mov_b32_e32 v1, s3
323; VI-NEXT:    v_mov_b32_e32 v2, s0
324; VI-NEXT:    flat_store_dword v[0:1], v2
325; VI-NEXT:    s_endpgm
326;
327; GFX9-LABEL: i16_zext_arg:
328; GFX9:       ; %bb.0:
329; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
330; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
331; GFX9-NEXT:    v_mov_b32_e32 v0, 0
332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
334; GFX9-NEXT:    v_mov_b32_e32 v1, s2
335; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
336; GFX9-NEXT:    s_endpgm
337;
338; EG-LABEL: i16_zext_arg:
339; EG:       ; %bb.0:
340; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
341; EG-NEXT:    TEX 0 @6
342; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
343; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
344; EG-NEXT:    CF_END
345; EG-NEXT:    PAD
346; EG-NEXT:    Fetch clause starting at 6:
347; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
348; EG-NEXT:    ALU clause starting at 8:
349; EG-NEXT:     MOV * T0.X, 0.0,
350; EG-NEXT:    ALU clause starting at 9:
351; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
352; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
353; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
354;
355; CM-LABEL: i16_zext_arg:
356; CM:       ; %bb.0:
357; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
358; CM-NEXT:    TEX 0 @6
359; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
360; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
361; CM-NEXT:    CF_END
362; CM-NEXT:    PAD
363; CM-NEXT:    Fetch clause starting at 6:
364; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
365; CM-NEXT:    ALU clause starting at 8:
366; CM-NEXT:     MOV * T0.X, 0.0,
367; CM-NEXT:    ALU clause starting at 9:
368; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
369; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
370; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
371; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
372  %ext = zext i16 %in to i32
373  store i32 %ext, i32 addrspace(1)* %out, align 4
374  ret void
375}
376
; i16 kernel argument marked signext: the body sign-extends %in to i32 and
; stores the result to %out.
; The amdgcn expectations load the argument as part of a full dword and apply
; s_sext_i32_i16; the r600 expectations fetch it with VTX_READ_16 at offset 40
; and then apply BFE_INT with a literal of 16.
377define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
378; SI-LABEL: i16_sext_arg:
379; SI:       ; %bb.0:
380; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
381; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
382; SI-NEXT:    s_mov_b32 s3, 0xf000
383; SI-NEXT:    s_waitcnt lgkmcnt(0)
384; SI-NEXT:    s_sext_i32_i16 s4, s2
385; SI-NEXT:    s_mov_b32 s2, -1
386; SI-NEXT:    v_mov_b32_e32 v0, s4
387; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
388; SI-NEXT:    s_endpgm
389;
390; VI-LABEL: i16_sext_arg:
391; VI:       ; %bb.0:
392; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
393; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
394; VI-NEXT:    s_waitcnt lgkmcnt(0)
395; VI-NEXT:    v_mov_b32_e32 v0, s2
396; VI-NEXT:    s_sext_i32_i16 s0, s0
397; VI-NEXT:    v_mov_b32_e32 v1, s3
398; VI-NEXT:    v_mov_b32_e32 v2, s0
399; VI-NEXT:    flat_store_dword v[0:1], v2
400; VI-NEXT:    s_endpgm
401;
402; GFX9-LABEL: i16_sext_arg:
403; GFX9:       ; %bb.0:
404; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
405; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
406; GFX9-NEXT:    v_mov_b32_e32 v0, 0
407; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
408; GFX9-NEXT:    s_sext_i32_i16 s2, s2
409; GFX9-NEXT:    v_mov_b32_e32 v1, s2
410; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
411; GFX9-NEXT:    s_endpgm
412;
413; EG-LABEL: i16_sext_arg:
414; EG:       ; %bb.0:
415; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
416; EG-NEXT:    TEX 0 @6
417; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
418; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
419; EG-NEXT:    CF_END
420; EG-NEXT:    PAD
421; EG-NEXT:    Fetch clause starting at 6:
422; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
423; EG-NEXT:    ALU clause starting at 8:
424; EG-NEXT:     MOV * T0.X, 0.0,
425; EG-NEXT:    ALU clause starting at 9:
426; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, literal.x,
427; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
428; EG-NEXT:    16(2.242078e-44), 2(2.802597e-45)
429;
430; CM-LABEL: i16_sext_arg:
431; CM:       ; %bb.0:
432; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
433; CM-NEXT:    TEX 0 @6
434; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
435; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
436; CM-NEXT:    CF_END
437; CM-NEXT:    PAD
438; CM-NEXT:    Fetch clause starting at 6:
439; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
440; CM-NEXT:    ALU clause starting at 8:
441; CM-NEXT:     MOV * T0.X, 0.0,
442; CM-NEXT:    ALU clause starting at 9:
443; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, literal.x,
444; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
445; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
446; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
447  %ext = sext i16 %in to i32
448  store i32 %ext, i32 addrspace(1)* %out, align 4
449  ret void
450}
451
; i32 kernel argument stored unmodified to %out.
; The amdgcn expectations load one dword for the argument and store it with
; no masking or extension in between; the r600 expectations have no fetch
; clause and take the value from KC0[2].Z with a MOV.
; The block label "entry" appears in the expected "%bb.0" lines, so renaming
; it requires regenerating the checks.
452define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
453; SI-LABEL: i32_arg:
454; SI:       ; %bb.0: ; %entry
455; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
456; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
457; SI-NEXT:    s_mov_b32 s3, 0xf000
458; SI-NEXT:    s_mov_b32 s2, -1
459; SI-NEXT:    s_waitcnt lgkmcnt(0)
460; SI-NEXT:    v_mov_b32_e32 v0, s4
461; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
462; SI-NEXT:    s_endpgm
463;
464; VI-LABEL: i32_arg:
465; VI:       ; %bb.0: ; %entry
466; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
467; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
468; VI-NEXT:    s_waitcnt lgkmcnt(0)
469; VI-NEXT:    v_mov_b32_e32 v0, s2
470; VI-NEXT:    v_mov_b32_e32 v1, s3
471; VI-NEXT:    v_mov_b32_e32 v2, s0
472; VI-NEXT:    flat_store_dword v[0:1], v2
473; VI-NEXT:    s_endpgm
474;
475; GFX9-LABEL: i32_arg:
476; GFX9:       ; %bb.0: ; %entry
477; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
478; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
479; GFX9-NEXT:    v_mov_b32_e32 v0, 0
480; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX9-NEXT:    v_mov_b32_e32 v1, s2
482; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
483; GFX9-NEXT:    s_endpgm
484;
485; EG-LABEL: i32_arg:
486; EG:       ; %bb.0: ; %entry
487; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
488; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
489; EG-NEXT:    CF_END
490; EG-NEXT:    PAD
491; EG-NEXT:    ALU clause starting at 4:
492; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
493; EG-NEXT:     MOV * T1.X, KC0[2].Z,
494; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
495;
496; CM-LABEL: i32_arg:
497; CM:       ; %bb.0: ; %entry
498; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
499; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
500; CM-NEXT:    CF_END
501; CM-NEXT:    PAD
502; CM-NEXT:    ALU clause starting at 4:
503; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
504; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
505; CM-NEXT:     MOV * T1.X, KC0[2].Z,
506entry:
507  store i32 %in, i32 addrspace(1)* %out, align 4
508  ret void
509}
510
; float kernel argument stored unmodified to %out.
; The expected instruction sequences for every prefix are the same as the
; ones listed for @i32_arg: a single dword load and store on amdgcn, and a MOV
; from KC0[2].Z on r600.
511define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
512; SI-LABEL: f32_arg:
513; SI:       ; %bb.0: ; %entry
514; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
515; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
516; SI-NEXT:    s_mov_b32 s3, 0xf000
517; SI-NEXT:    s_mov_b32 s2, -1
518; SI-NEXT:    s_waitcnt lgkmcnt(0)
519; SI-NEXT:    v_mov_b32_e32 v0, s4
520; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
521; SI-NEXT:    s_endpgm
522;
523; VI-LABEL: f32_arg:
524; VI:       ; %bb.0: ; %entry
525; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
526; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
527; VI-NEXT:    s_waitcnt lgkmcnt(0)
528; VI-NEXT:    v_mov_b32_e32 v0, s2
529; VI-NEXT:    v_mov_b32_e32 v1, s3
530; VI-NEXT:    v_mov_b32_e32 v2, s0
531; VI-NEXT:    flat_store_dword v[0:1], v2
532; VI-NEXT:    s_endpgm
533;
534; GFX9-LABEL: f32_arg:
535; GFX9:       ; %bb.0: ; %entry
536; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
537; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
538; GFX9-NEXT:    v_mov_b32_e32 v0, 0
539; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
540; GFX9-NEXT:    v_mov_b32_e32 v1, s2
541; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
542; GFX9-NEXT:    s_endpgm
543;
544; EG-LABEL: f32_arg:
545; EG:       ; %bb.0: ; %entry
546; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
547; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
548; EG-NEXT:    CF_END
549; EG-NEXT:    PAD
550; EG-NEXT:    ALU clause starting at 4:
551; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
552; EG-NEXT:     MOV * T1.X, KC0[2].Z,
553; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
554;
555; CM-LABEL: f32_arg:
556; CM:       ; %bb.0: ; %entry
557; CM-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
558; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
559; CM-NEXT:    CF_END
560; CM-NEXT:    PAD
561; CM-NEXT:    ALU clause starting at 4:
562; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
563; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
564; CM-NEXT:     MOV * T1.X, KC0[2].Z,
565entry:
566  store float %in, float addrspace(1)* %out, align 4
567  ret void
568}
569
; <2 x i8> kernel argument stored unmodified to %out (no explicit alignment
; on the store, and no nocapture/nounwind attributes on this kernel).
; The amdgcn expectations load one dword for the argument and write it back
; with a single 16-bit (short) store.
; The r600 expectations fetch the two bytes separately with VTX_READ_8 at
; offsets 40 and 41, combine them in the ALU clause, and finish with a
; MEM_RAT MSKOR instead of the MEM_RAT_CACHELESS store expected for the
; scalar kernels in this file.
570define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
571; SI-LABEL: v2i8_arg:
572; SI:       ; %bb.0: ; %entry
573; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
574; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
575; SI-NEXT:    s_mov_b32 s3, 0xf000
576; SI-NEXT:    s_mov_b32 s2, -1
577; SI-NEXT:    s_waitcnt lgkmcnt(0)
578; SI-NEXT:    v_mov_b32_e32 v0, s4
579; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
580; SI-NEXT:    s_endpgm
581;
582; VI-LABEL: v2i8_arg:
583; VI:       ; %bb.0: ; %entry
584; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
585; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
586; VI-NEXT:    s_waitcnt lgkmcnt(0)
587; VI-NEXT:    v_mov_b32_e32 v0, s2
588; VI-NEXT:    v_mov_b32_e32 v1, s3
589; VI-NEXT:    v_mov_b32_e32 v2, s0
590; VI-NEXT:    flat_store_short v[0:1], v2
591; VI-NEXT:    s_endpgm
592;
593; GFX9-LABEL: v2i8_arg:
594; GFX9:       ; %bb.0: ; %entry
595; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
596; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
597; GFX9-NEXT:    v_mov_b32_e32 v0, 0
598; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX9-NEXT:    v_mov_b32_e32 v1, s2
600; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
601; GFX9-NEXT:    s_endpgm
602;
603; EG-LABEL: v2i8_arg:
604; EG:       ; %bb.0: ; %entry
605; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
606; EG-NEXT:    TEX 1 @6
607; EG-NEXT:    ALU 15, @11, KC0[CB0:0-32], KC1[]
608; EG-NEXT:    MEM_RAT MSKOR T4.XW, T5.X
609; EG-NEXT:    CF_END
610; EG-NEXT:    PAD
611; EG-NEXT:    Fetch clause starting at 6:
612; EG-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
613; EG-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
614; EG-NEXT:    ALU clause starting at 10:
615; EG-NEXT:     MOV * T4.X, 0.0,
616; EG-NEXT:    ALU clause starting at 11:
617; EG-NEXT:     LSHL T0.W, T5.X, literal.x,
618; EG-NEXT:     AND_INT * T1.W, T4.X, literal.y,
619; EG-NEXT:    8(1.121039e-44), 255(3.573311e-43)
620; EG-NEXT:     AND_INT T2.W, KC0[2].Y, literal.x,
621; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
622; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
623; EG-NEXT:     AND_INT T0.W, PS, literal.x,
624; EG-NEXT:     LSHL * T1.W, PV.W, literal.y,
625; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
626; EG-NEXT:     LSHL T4.X, PV.W, PS,
627; EG-NEXT:     LSHL * T4.W, literal.x, PS,
628; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
629; EG-NEXT:     MOV T4.Y, 0.0,
630; EG-NEXT:     MOV * T4.Z, 0.0,
631; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
632; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
633;
634; CM-LABEL: v2i8_arg:
635; CM:       ; %bb.0: ; %entry
636; CM-NEXT:    ALU 0, @10, KC0[], KC1[]
637; CM-NEXT:    TEX 1 @6
638; CM-NEXT:    ALU 15, @11, KC0[CB0:0-32], KC1[]
639; CM-NEXT:    MEM_RAT MSKOR T4.XW, T5.X
640; CM-NEXT:    CF_END
641; CM-NEXT:    PAD
642; CM-NEXT:    Fetch clause starting at 6:
643; CM-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
644; CM-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
645; CM-NEXT:    ALU clause starting at 10:
646; CM-NEXT:     MOV * T4.X, 0.0,
647; CM-NEXT:    ALU clause starting at 11:
648; CM-NEXT:     LSHL T0.Z, T5.X, literal.x,
649; CM-NEXT:     AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
650; CM-NEXT:    8(1.121039e-44), 255(3.573311e-43)
651; CM-NEXT:     AND_INT T1.Z, KC0[2].Y, literal.x,
652; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
653; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
654; CM-NEXT:     AND_INT T0.Z, PV.W, literal.x,
655; CM-NEXT:     LSHL * T0.W, PV.Z, literal.y,
656; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
657; CM-NEXT:     LSHL T4.X, PV.Z, PV.W,
658; CM-NEXT:     LSHL * T4.W, literal.x, PV.W,
659; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
660; CM-NEXT:     MOV T4.Y, 0.0,
661; CM-NEXT:     MOV * T4.Z, 0.0,
662; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
663; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
664entry:
665  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
666  ret void
667}
668
; <2 x i16> kernel argument stored unmodified to %out (no explicit alignment
; on the store, and no nocapture/nounwind attributes on this kernel).
; The amdgcn expectations load the whole vector as one dword and store it as
; one dword.
; The r600 expectations fetch the two halves separately with VTX_READ_16 at
; offsets 40 and 42, recombine them with LSHL by 16, AND_INT with 65535 and
; OR_INT, and then store a single dword.
669define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
670; SI-LABEL: v2i16_arg:
671; SI:       ; %bb.0: ; %entry
672; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
673; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
674; SI-NEXT:    s_mov_b32 s3, 0xf000
675; SI-NEXT:    s_mov_b32 s2, -1
676; SI-NEXT:    s_waitcnt lgkmcnt(0)
677; SI-NEXT:    v_mov_b32_e32 v0, s4
678; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
679; SI-NEXT:    s_endpgm
680;
681; VI-LABEL: v2i16_arg:
682; VI:       ; %bb.0: ; %entry
683; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
684; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
685; VI-NEXT:    s_waitcnt lgkmcnt(0)
686; VI-NEXT:    v_mov_b32_e32 v0, s2
687; VI-NEXT:    v_mov_b32_e32 v1, s3
688; VI-NEXT:    v_mov_b32_e32 v2, s0
689; VI-NEXT:    flat_store_dword v[0:1], v2
690; VI-NEXT:    s_endpgm
691;
692; GFX9-LABEL: v2i16_arg:
693; GFX9:       ; %bb.0: ; %entry
694; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
695; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
696; GFX9-NEXT:    v_mov_b32_e32 v0, 0
697; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX9-NEXT:    v_mov_b32_e32 v1, s2
699; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
700; GFX9-NEXT:    s_endpgm
701;
702; EG-LABEL: v2i16_arg:
703; EG:       ; %bb.0: ; %entry
704; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
705; EG-NEXT:    TEX 1 @6
706; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
707; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
708; EG-NEXT:    CF_END
709; EG-NEXT:    PAD
710; EG-NEXT:    Fetch clause starting at 6:
711; EG-NEXT:     VTX_READ_16 T5.X, T4.X, 42, #3
712; EG-NEXT:     VTX_READ_16 T4.X, T4.X, 40, #3
713; EG-NEXT:    ALU clause starting at 10:
714; EG-NEXT:     MOV * T4.X, 0.0,
715; EG-NEXT:    ALU clause starting at 11:
716; EG-NEXT:     LSHL T0.W, T5.X, literal.x,
717; EG-NEXT:     AND_INT * T1.W, T4.X, literal.y,
718; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
719; EG-NEXT:     OR_INT T4.X, PV.W, PS,
720; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
721; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
722;
723; CM-LABEL: v2i16_arg:
724; CM:       ; %bb.0: ; %entry
725; CM-NEXT:    ALU 0, @10, KC0[], KC1[]
726; CM-NEXT:    TEX 1 @6
727; CM-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
728; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X
729; CM-NEXT:    CF_END
730; CM-NEXT:    PAD
731; CM-NEXT:    Fetch clause starting at 6:
732; CM-NEXT:     VTX_READ_16 T5.X, T4.X, 42, #3
733; CM-NEXT:     VTX_READ_16 T4.X, T4.X, 40, #3
734; CM-NEXT:    ALU clause starting at 10:
735; CM-NEXT:     MOV * T4.X, 0.0,
736; CM-NEXT:    ALU clause starting at 11:
737; CM-NEXT:     LSHL T0.Z, T5.X, literal.x,
738; CM-NEXT:     AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
739; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
740; CM-NEXT:     OR_INT * T4.X, PV.Z, PV.W,
741; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
742; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
743entry:
744  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
745  ret void
746}
747
; <2 x i32> kernel argument stored unmodified to %out with align 4.
; The amdgcn expectations load the vector with one s_load_dwordx2 and write
; it back with one dwordx2 store.
; The r600 expectations have no fetch clause: the two elements are taken from
; KC0[2].W and KC0[3].X with MOVs and stored together.
748define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
749; SI-LABEL: v2i32_arg:
750; SI:       ; %bb.0: ; %entry
751; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
752; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
753; SI-NEXT:    s_mov_b32 s3, 0xf000
754; SI-NEXT:    s_mov_b32 s2, -1
755; SI-NEXT:    s_waitcnt lgkmcnt(0)
756; SI-NEXT:    v_mov_b32_e32 v0, s4
757; SI-NEXT:    v_mov_b32_e32 v1, s5
758; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
759; SI-NEXT:    s_endpgm
760;
761; VI-LABEL: v2i32_arg:
762; VI:       ; %bb.0: ; %entry
763; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
764; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
765; VI-NEXT:    s_waitcnt lgkmcnt(0)
766; VI-NEXT:    v_mov_b32_e32 v0, s2
767; VI-NEXT:    v_mov_b32_e32 v3, s1
768; VI-NEXT:    v_mov_b32_e32 v1, s3
769; VI-NEXT:    v_mov_b32_e32 v2, s0
770; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
771; VI-NEXT:    s_endpgm
772;
773; GFX9-LABEL: v2i32_arg:
774; GFX9:       ; %bb.0: ; %entry
775; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
776; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
777; GFX9-NEXT:    v_mov_b32_e32 v2, 0
778; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
779; GFX9-NEXT:    v_mov_b32_e32 v0, s2
780; GFX9-NEXT:    v_mov_b32_e32 v1, s3
781; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
782; GFX9-NEXT:    s_endpgm
783;
784; EG-LABEL: v2i32_arg:
785; EG:       ; %bb.0: ; %entry
786; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
787; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
788; EG-NEXT:    CF_END
789; EG-NEXT:    PAD
790; EG-NEXT:    ALU clause starting at 4:
791; EG-NEXT:     MOV * T0.Y, KC0[3].X,
792; EG-NEXT:     MOV T0.X, KC0[2].W,
793; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
794; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
795;
796; CM-LABEL: v2i32_arg:
797; CM:       ; %bb.0: ; %entry
798; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
799; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
800; CM-NEXT:    CF_END
801; CM-NEXT:    PAD
802; CM-NEXT:    ALU clause starting at 4:
803; CM-NEXT:     MOV * T0.Y, KC0[3].X,
804; CM-NEXT:     MOV * T0.X, KC0[2].W,
805; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
806; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
807entry:
808  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
809  ret void
810}
811
; <2 x float> kernel argument stored unmodified to %out with align 4.
; The expected instruction sequences for every prefix are the same as the
; ones listed for @v2i32_arg: one dwordx2 load and store on amdgcn, and MOVs
; from KC0[2].W and KC0[3].X on r600.
812define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
813; SI-LABEL: v2f32_arg:
814; SI:       ; %bb.0: ; %entry
815; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
816; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
817; SI-NEXT:    s_mov_b32 s3, 0xf000
818; SI-NEXT:    s_mov_b32 s2, -1
819; SI-NEXT:    s_waitcnt lgkmcnt(0)
820; SI-NEXT:    v_mov_b32_e32 v0, s4
821; SI-NEXT:    v_mov_b32_e32 v1, s5
822; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
823; SI-NEXT:    s_endpgm
824;
825; VI-LABEL: v2f32_arg:
826; VI:       ; %bb.0: ; %entry
827; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
828; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
829; VI-NEXT:    s_waitcnt lgkmcnt(0)
830; VI-NEXT:    v_mov_b32_e32 v0, s2
831; VI-NEXT:    v_mov_b32_e32 v3, s1
832; VI-NEXT:    v_mov_b32_e32 v1, s3
833; VI-NEXT:    v_mov_b32_e32 v2, s0
834; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
835; VI-NEXT:    s_endpgm
836;
837; GFX9-LABEL: v2f32_arg:
838; GFX9:       ; %bb.0: ; %entry
839; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
840; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
841; GFX9-NEXT:    v_mov_b32_e32 v2, 0
842; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX9-NEXT:    v_mov_b32_e32 v0, s2
844; GFX9-NEXT:    v_mov_b32_e32 v1, s3
845; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
846; GFX9-NEXT:    s_endpgm
847;
848; EG-LABEL: v2f32_arg:
849; EG:       ; %bb.0: ; %entry
850; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
851; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
852; EG-NEXT:    CF_END
853; EG-NEXT:    PAD
854; EG-NEXT:    ALU clause starting at 4:
855; EG-NEXT:     MOV * T0.Y, KC0[3].X,
856; EG-NEXT:     MOV T0.X, KC0[2].W,
857; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
858; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
859;
860; CM-LABEL: v2f32_arg:
861; CM:       ; %bb.0: ; %entry
862; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
863; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
864; CM-NEXT:    CF_END
865; CM-NEXT:    PAD
866; CM-NEXT:    ALU clause starting at 4:
867; CM-NEXT:     MOV * T0.Y, KC0[3].X,
868; CM-NEXT:     MOV * T0.X, KC0[2].W,
869; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
870; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
871entry:
872  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
873  ret void
874}
875
; Kernel storing a by-value <3 x i8> argument to a global (addrspace(1))
; pointer with 4-byte alignment. The assertions show the 3-byte store being
; split in two: the amdgcn targets emit a short store plus a byte store at
; offset 2 (VI computes the +2 address with s_add_u32/s_addc_u32), while the
; r600 targets fetch the three bytes separately and emit two MEM_RAT MSKOR
; writes.
876define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
877; SI-LABEL: v3i8_arg:
878; SI:       ; %bb.0: ; %entry
879; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
880; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
881; SI-NEXT:    s_mov_b32 s3, 0xf000
882; SI-NEXT:    s_waitcnt lgkmcnt(0)
883; SI-NEXT:    s_lshr_b32 s5, s4, 16
884; SI-NEXT:    s_mov_b32 s2, -1
885; SI-NEXT:    v_mov_b32_e32 v0, s4
886; SI-NEXT:    v_mov_b32_e32 v1, s5
887; SI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
888; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
889; SI-NEXT:    s_endpgm
890;
891; VI-LABEL: v3i8_arg:
892; VI:       ; %bb.0: ; %entry
893; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
894; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
895; VI-NEXT:    s_waitcnt lgkmcnt(0)
896; VI-NEXT:    v_mov_b32_e32 v0, s2
897; VI-NEXT:    s_lshr_b32 s1, s0, 16
898; VI-NEXT:    v_mov_b32_e32 v4, s0
899; VI-NEXT:    s_add_u32 s0, s2, 2
900; VI-NEXT:    v_mov_b32_e32 v5, s1
901; VI-NEXT:    s_addc_u32 s1, s3, 0
902; VI-NEXT:    v_mov_b32_e32 v3, s1
903; VI-NEXT:    v_mov_b32_e32 v2, s0
904; VI-NEXT:    v_mov_b32_e32 v1, s3
905; VI-NEXT:    flat_store_byte v[2:3], v5
906; VI-NEXT:    flat_store_short v[0:1], v4
907; VI-NEXT:    s_endpgm
908;
909; GFX9-LABEL: v3i8_arg:
910; GFX9:       ; %bb.0: ; %entry
911; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
912; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
913; GFX9-NEXT:    v_mov_b32_e32 v0, 0
914; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX9-NEXT:    v_mov_b32_e32 v1, s2
916; GFX9-NEXT:    global_store_byte_d16_hi v0, v1, s[0:1] offset:2
917; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
918; GFX9-NEXT:    s_endpgm
919;
920; EG-LABEL: v3i8_arg:
921; EG:       ; %bb.0: ; %entry
922; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
923; EG-NEXT:    TEX 2 @6
924; EG-NEXT:    ALU 28, @13, KC0[CB0:0-32], KC1[]
925; EG-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
926; EG-NEXT:    MEM_RAT MSKOR T5.XW, T6.X
927; EG-NEXT:    CF_END
928; EG-NEXT:    Fetch clause starting at 6:
929; EG-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
930; EG-NEXT:     VTX_READ_8 T6.X, T4.X, 42, #3
931; EG-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
932; EG-NEXT:    ALU clause starting at 12:
933; EG-NEXT:     MOV * T4.X, 0.0,
934; EG-NEXT:    ALU clause starting at 13:
935; EG-NEXT:     LSHL T0.W, T5.X, literal.x,
936; EG-NEXT:     AND_INT * T1.W, T4.X, literal.y,
937; EG-NEXT:    8(1.121039e-44), 255(3.573311e-43)
938; EG-NEXT:     AND_INT T2.W, KC0[2].Y, literal.x,
939; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
940; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
941; EG-NEXT:     AND_INT T0.W, PS, literal.x,
942; EG-NEXT:     LSHL * T1.W, PV.W, literal.y,
943; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
944; EG-NEXT:     LSHL T4.X, PV.W, PS,
945; EG-NEXT:     LSHL * T4.W, literal.x, PS,
946; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
947; EG-NEXT:     MOV T4.Y, 0.0,
948; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
949; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
950; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
951; EG-NEXT:     AND_INT * T2.W, T6.X, literal.y,
952; EG-NEXT:    3(4.203895e-45), 255(3.573311e-43)
953; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
954; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
955; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
956; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
957; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
958; EG-NEXT:     MOV T5.Y, 0.0,
959; EG-NEXT:     MOV T4.Z, 0.0,
960; EG-NEXT:     MOV * T5.Z, 0.0,
961; EG-NEXT:     LSHR T6.X, T0.W, literal.x,
962; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
963; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
964;
965; CM-LABEL: v3i8_arg:
966; CM:       ; %bb.0: ; %entry
967; CM-NEXT:    ALU 0, @12, KC0[], KC1[]
968; CM-NEXT:    TEX 2 @6
969; CM-NEXT:    ALU 29, @13, KC0[CB0:0-32], KC1[]
970; CM-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
971; CM-NEXT:    MEM_RAT MSKOR T5.XW, T6.X
972; CM-NEXT:    CF_END
973; CM-NEXT:    Fetch clause starting at 6:
974; CM-NEXT:     VTX_READ_8 T5.X, T4.X, 41, #3
975; CM-NEXT:     VTX_READ_8 T6.X, T4.X, 42, #3
976; CM-NEXT:     VTX_READ_8 T4.X, T4.X, 40, #3
977; CM-NEXT:    ALU clause starting at 12:
978; CM-NEXT:     MOV * T4.X, 0.0,
979; CM-NEXT:    ALU clause starting at 13:
980; CM-NEXT:     LSHL T0.Z, T5.X, literal.x,
981; CM-NEXT:     AND_INT * T0.W, T4.X, literal.y, BS:VEC_120/SCL_212
982; CM-NEXT:    8(1.121039e-44), 255(3.573311e-43)
983; CM-NEXT:     AND_INT T1.Z, KC0[2].Y, literal.x,
984; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
985; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
986; CM-NEXT:     AND_INT T0.Z, PV.W, literal.x,
987; CM-NEXT:     LSHL * T0.W, PV.Z, literal.y,
988; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
989; CM-NEXT:     LSHL T4.X, PV.Z, PV.W,
990; CM-NEXT:     LSHL * T4.W, literal.x, PV.W,
991; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
992; CM-NEXT:     MOV T4.Y, 0.0,
993; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
994; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
995; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
996; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
997; CM-NEXT:     AND_INT T0.Z, T6.X, literal.x,
998; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
999; CM-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1000; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
1001; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
1002; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1003; CM-NEXT:     MOV T5.Y, 0.0,
1004; CM-NEXT:     MOV * T4.Z, 0.0,
1005; CM-NEXT:     MOV * T5.Z, 0.0,
1006; CM-NEXT:     LSHR * T6.X, T0.W, literal.x,
1007; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1008; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1009; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1010entry:
1011  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
1012  ret void
1013}
1014
; Kernel storing a by-value <3 x i16> argument to a global (addrspace(1))
; pointer with 4-byte alignment. The assertions show the 6-byte store split
; into a dword store for the first two elements and a 16-bit store at byte
; offset 4 for the third. The amdgcn targets load the argument with one
; s_load_dwordx2; the r600 targets issue three VTX_READ_16 fetches and pair a
; cacheless dword store with a MEM_RAT MSKOR write.
1015define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
1016; SI-LABEL: v3i16_arg:
1017; SI:       ; %bb.0: ; %entry
1018; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1019; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1020; SI-NEXT:    s_mov_b32 s3, 0xf000
1021; SI-NEXT:    s_mov_b32 s2, -1
1022; SI-NEXT:    s_waitcnt lgkmcnt(0)
1023; SI-NEXT:    v_mov_b32_e32 v0, s5
1024; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:4
1025; SI-NEXT:    s_waitcnt expcnt(0)
1026; SI-NEXT:    v_mov_b32_e32 v0, s4
1027; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1028; SI-NEXT:    s_endpgm
1029;
1030; VI-LABEL: v3i16_arg:
1031; VI:       ; %bb.0: ; %entry
1032; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1033; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1034; VI-NEXT:    s_waitcnt lgkmcnt(0)
1035; VI-NEXT:    s_add_u32 s4, s2, 4
1036; VI-NEXT:    s_addc_u32 s5, s3, 0
1037; VI-NEXT:    v_mov_b32_e32 v2, s4
1038; VI-NEXT:    v_mov_b32_e32 v0, s2
1039; VI-NEXT:    v_mov_b32_e32 v4, s1
1040; VI-NEXT:    v_mov_b32_e32 v3, s5
1041; VI-NEXT:    v_mov_b32_e32 v1, s3
1042; VI-NEXT:    v_mov_b32_e32 v5, s0
1043; VI-NEXT:    flat_store_short v[2:3], v4
1044; VI-NEXT:    flat_store_dword v[0:1], v5
1045; VI-NEXT:    s_endpgm
1046;
1047; GFX9-LABEL: v3i16_arg:
1048; GFX9:       ; %bb.0: ; %entry
1049; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1050; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1051; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1052; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1054; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1055; GFX9-NEXT:    global_store_short v0, v1, s[0:1] offset:4
1056; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
1057; GFX9-NEXT:    s_endpgm
1058;
1059; EG-LABEL: v3i16_arg:
1060; EG:       ; %bb.0: ; %entry
1061; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
1062; EG-NEXT:    TEX 2 @6
1063; EG-NEXT:    ALU 19, @13, KC0[CB0:0-32], KC1[]
1064; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
1065; EG-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
1066; EG-NEXT:    CF_END
1067; EG-NEXT:    Fetch clause starting at 6:
1068; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 44, #3
1069; EG-NEXT:     VTX_READ_16 T7.X, T5.X, 46, #3
1070; EG-NEXT:     VTX_READ_16 T5.X, T5.X, 48, #3
1071; EG-NEXT:    ALU clause starting at 12:
1072; EG-NEXT:     MOV * T5.X, 0.0,
1073; EG-NEXT:    ALU clause starting at 13:
1074; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1075; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1076; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1077; EG-NEXT:     AND_INT * T2.W, T5.X, literal.y,
1078; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1079; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1080; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1081; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
1082; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
1083; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1084; EG-NEXT:     MOV T5.Y, 0.0,
1085; EG-NEXT:     MOV * T5.Z, 0.0,
1086; EG-NEXT:     LSHR T8.X, T0.W, literal.x,
1087; EG-NEXT:     LSHL T0.W, T7.X, literal.y,
1088; EG-NEXT:     AND_INT * T1.W, T6.X, literal.z,
1089; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
1090; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1091; EG-NEXT:     OR_INT T6.X, PV.W, PS,
1092; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1093; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1094;
1095; CM-LABEL: v3i16_arg:
1096; CM:       ; %bb.0: ; %entry
1097; CM-NEXT:    ALU 0, @12, KC0[], KC1[]
1098; CM-NEXT:    TEX 2 @6
1099; CM-NEXT:    ALU 19, @13, KC0[CB0:0-32], KC1[]
1100; CM-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
1101; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
1102; CM-NEXT:    CF_END
1103; CM-NEXT:    Fetch clause starting at 6:
1104; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 44, #3
1105; CM-NEXT:     VTX_READ_16 T7.X, T5.X, 46, #3
1106; CM-NEXT:     VTX_READ_16 T5.X, T5.X, 48, #3
1107; CM-NEXT:    ALU clause starting at 12:
1108; CM-NEXT:     MOV * T5.X, 0.0,
1109; CM-NEXT:    ALU clause starting at 13:
1110; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1111; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1112; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
1113; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1114; CM-NEXT:     AND_INT T0.Z, T5.X, literal.x,
1115; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
1116; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1117; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
1118; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
1119; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1120; CM-NEXT:     MOV T5.Y, 0.0,
1121; CM-NEXT:     MOV * T5.Z, 0.0,
1122; CM-NEXT:     LSHL T0.Z, T7.X, literal.x,
1123; CM-NEXT:     AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
1124; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
1125; CM-NEXT:     OR_INT * T6.X, PV.Z, PV.W,
1126; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1127; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1128; CM-NEXT:     LSHR * T8.X, T0.W, literal.x,
1129; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1130entry:
1131  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
1132  ret void
1133}
1134
; Kernel storing a by-value <3 x i32> argument to a global (addrspace(1))
; pointer with 4-byte alignment. In the assertions the amdgcn targets load the
; vector with a single s_load_dwordx4. The SI expectations then split the
; 12-byte store into a dword store at offset 8 plus a dwordx2 store, whereas
; the VI and GFX9 expectations use one dwordx3 store. The r600 targets read
; the elements from KC0[3].Y/.Z/.W and emit two cacheless stores.
1135define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
1136; SI-LABEL: v3i32_arg:
1137; SI:       ; %bb.0: ; %entry
1138; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1139; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1140; SI-NEXT:    s_mov_b32 s3, 0xf000
1141; SI-NEXT:    s_mov_b32 s2, -1
1142; SI-NEXT:    s_waitcnt lgkmcnt(0)
1143; SI-NEXT:    v_mov_b32_e32 v0, s6
1144; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
1145; SI-NEXT:    s_waitcnt expcnt(0)
1146; SI-NEXT:    v_mov_b32_e32 v0, s4
1147; SI-NEXT:    v_mov_b32_e32 v1, s5
1148; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1149; SI-NEXT:    s_endpgm
1150;
1151; VI-LABEL: v3i32_arg:
1152; VI:       ; %bb.0: ; %entry
1153; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1154; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
1155; VI-NEXT:    s_waitcnt lgkmcnt(0)
1156; VI-NEXT:    v_mov_b32_e32 v3, s4
1157; VI-NEXT:    v_mov_b32_e32 v0, s0
1158; VI-NEXT:    v_mov_b32_e32 v1, s1
1159; VI-NEXT:    v_mov_b32_e32 v2, s2
1160; VI-NEXT:    v_mov_b32_e32 v4, s5
1161; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
1162; VI-NEXT:    s_endpgm
1163;
1164; GFX9-LABEL: v3i32_arg:
1165; GFX9:       ; %bb.0: ; %entry
1166; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1167; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1168; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1169; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1171; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1172; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1173; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
1174; GFX9-NEXT:    s_endpgm
1175;
1176; EG-LABEL: v3i32_arg:
1177; EG:       ; %bb.0: ; %entry
1178; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1179; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1180; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1181; EG-NEXT:    CF_END
1182; EG-NEXT:    ALU clause starting at 4:
1183; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1184; EG-NEXT:     MOV T0.X, KC0[3].Y,
1185; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1186; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1187; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1188; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1189; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1190; EG-NEXT:     MOV * T3.X, KC0[3].W,
1191; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1192;
1193; CM-LABEL: v3i32_arg:
1194; CM:       ; %bb.0: ; %entry
1195; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1196; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1197; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1198; CM-NEXT:    CF_END
1199; CM-NEXT:    ALU clause starting at 4:
1200; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1201; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1202; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
1203; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1204; CM-NEXT:     MOV T1.X, KC0[3].W,
1205; CM-NEXT:     MOV * T2.Y, KC0[3].Z,
1206; CM-NEXT:     MOV * T2.X, KC0[3].Y,
1207; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
1208; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1209entry:
1210  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
1211  ret void
1212}
1213
; Kernel storing a by-value <3 x float> argument to a global (addrspace(1))
; pointer with 4-byte alignment. The expected instruction sequences are the
; same as those recorded for @v3i32_arg: a dwordx4 scalar load of the
; argument on the amdgcn targets, a dword + dwordx2 store pair in the SI
; expectations, a single dwordx3 store in the VI and GFX9 expectations, and
; two cacheless stores on the r600 targets.
1214define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
1215; SI-LABEL: v3f32_arg:
1216; SI:       ; %bb.0: ; %entry
1217; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1218; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1219; SI-NEXT:    s_mov_b32 s3, 0xf000
1220; SI-NEXT:    s_mov_b32 s2, -1
1221; SI-NEXT:    s_waitcnt lgkmcnt(0)
1222; SI-NEXT:    v_mov_b32_e32 v0, s6
1223; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
1224; SI-NEXT:    s_waitcnt expcnt(0)
1225; SI-NEXT:    v_mov_b32_e32 v0, s4
1226; SI-NEXT:    v_mov_b32_e32 v1, s5
1227; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1228; SI-NEXT:    s_endpgm
1229;
1230; VI-LABEL: v3f32_arg:
1231; VI:       ; %bb.0: ; %entry
1232; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1233; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
1234; VI-NEXT:    s_waitcnt lgkmcnt(0)
1235; VI-NEXT:    v_mov_b32_e32 v3, s4
1236; VI-NEXT:    v_mov_b32_e32 v0, s0
1237; VI-NEXT:    v_mov_b32_e32 v1, s1
1238; VI-NEXT:    v_mov_b32_e32 v2, s2
1239; VI-NEXT:    v_mov_b32_e32 v4, s5
1240; VI-NEXT:    flat_store_dwordx3 v[3:4], v[0:2]
1241; VI-NEXT:    s_endpgm
1242;
1243; GFX9-LABEL: v3f32_arg:
1244; GFX9:       ; %bb.0: ; %entry
1245; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1246; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1247; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1248; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1249; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1250; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1251; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1252; GFX9-NEXT:    global_store_dwordx3 v3, v[0:2], s[6:7]
1253; GFX9-NEXT:    s_endpgm
1254;
1255; EG-LABEL: v3f32_arg:
1256; EG:       ; %bb.0: ; %entry
1257; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1258; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
1259; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1260; EG-NEXT:    CF_END
1261; EG-NEXT:    ALU clause starting at 4:
1262; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1263; EG-NEXT:     MOV T0.X, KC0[3].Y,
1264; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1265; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1266; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1267; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1268; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1269; EG-NEXT:     MOV * T3.X, KC0[3].W,
1270; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1271;
1272; CM-LABEL: v3f32_arg:
1273; CM:       ; %bb.0: ; %entry
1274; CM-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
1275; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T3.X
1276; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T0.X
1277; CM-NEXT:    CF_END
1278; CM-NEXT:    ALU clause starting at 4:
1279; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1280; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1281; CM-NEXT:     LSHR * T0.X, PV.W, literal.x,
1282; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1283; CM-NEXT:     MOV T1.X, KC0[3].W,
1284; CM-NEXT:     MOV * T2.Y, KC0[3].Z,
1285; CM-NEXT:     MOV * T2.X, KC0[3].Y,
1286; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
1287; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1288entry:
1289  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
1290  ret void
1291}
1292
; Kernel storing a by-value <4 x i8> argument to a global (addrspace(1))
; pointer. This definition carries no nocapture/nounwind attributes, and the
; store has no explicit alignment. In the assertions the amdgcn targets treat
; the vector as one dword (single s_load_dword, single dword store). The r600
; targets fetch each byte with its own VTX_READ_8, recombine them with
; AND/LSHL/OR, and emit one cacheless dword store.
1293define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
1294; SI-LABEL: v4i8_arg:
1295; SI:       ; %bb.0: ; %entry
1296; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
1297; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1298; SI-NEXT:    s_mov_b32 s3, 0xf000
1299; SI-NEXT:    s_mov_b32 s2, -1
1300; SI-NEXT:    s_waitcnt lgkmcnt(0)
1301; SI-NEXT:    v_mov_b32_e32 v0, s4
1302; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1303; SI-NEXT:    s_endpgm
1304;
1305; VI-LABEL: v4i8_arg:
1306; VI:       ; %bb.0: ; %entry
1307; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1308; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
1309; VI-NEXT:    s_waitcnt lgkmcnt(0)
1310; VI-NEXT:    v_mov_b32_e32 v0, s2
1311; VI-NEXT:    v_mov_b32_e32 v1, s3
1312; VI-NEXT:    v_mov_b32_e32 v2, s0
1313; VI-NEXT:    flat_store_dword v[0:1], v2
1314; VI-NEXT:    s_endpgm
1315;
1316; GFX9-LABEL: v4i8_arg:
1317; GFX9:       ; %bb.0: ; %entry
1318; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1319; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
1320; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1321; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1322; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1323; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1324; GFX9-NEXT:    s_endpgm
1325;
1326; EG-LABEL: v4i8_arg:
1327; EG:       ; %bb.0: ; %entry
1328; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
1329; EG-NEXT:    TEX 3 @6
1330; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
1331; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 1
1332; EG-NEXT:    CF_END
1333; EG-NEXT:    PAD
1334; EG-NEXT:    Fetch clause starting at 6:
1335; EG-NEXT:     VTX_READ_8 T5.X, T4.X, 42, #3
1336; EG-NEXT:     VTX_READ_8 T6.X, T4.X, 40, #3
1337; EG-NEXT:     VTX_READ_8 T7.X, T4.X, 43, #3
1338; EG-NEXT:     VTX_READ_8 T4.X, T4.X, 41, #3
1339; EG-NEXT:    ALU clause starting at 14:
1340; EG-NEXT:     MOV * T4.X, 0.0,
1341; EG-NEXT:    ALU clause starting at 15:
1342; EG-NEXT:     AND_INT * T0.W, T5.X, literal.x,
1343; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1344; EG-NEXT:     AND_INT T0.Z, T4.X, literal.x,
1345; EG-NEXT:     LSHL T0.W, PV.W, literal.y,
1346; EG-NEXT:     LSHL * T1.W, T7.X, literal.z,
1347; EG-NEXT:    255(3.573311e-43), 16(2.242078e-44)
1348; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
1349; EG-NEXT:     OR_INT T0.W, PS, PV.W,
1350; EG-NEXT:     LSHL * T1.W, PV.Z, literal.x,
1351; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1352; EG-NEXT:     OR_INT T0.W, PV.W, PS,
1353; EG-NEXT:     AND_INT * T1.W, T6.X, literal.x,
1354; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1355; EG-NEXT:     OR_INT T4.X, PV.W, PS,
1356; EG-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
1357; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1358;
1359; CM-LABEL: v4i8_arg:
1360; CM:       ; %bb.0: ; %entry
1361; CM-NEXT:    ALU 0, @14, KC0[], KC1[]
1362; CM-NEXT:    TEX 3 @6
1363; CM-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
1364; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T5.X
1365; CM-NEXT:    CF_END
1366; CM-NEXT:    PAD
1367; CM-NEXT:    Fetch clause starting at 6:
1368; CM-NEXT:     VTX_READ_8 T5.X, T4.X, 42, #3
1369; CM-NEXT:     VTX_READ_8 T6.X, T4.X, 40, #3
1370; CM-NEXT:     VTX_READ_8 T7.X, T4.X, 43, #3
1371; CM-NEXT:     VTX_READ_8 T4.X, T4.X, 41, #3
1372; CM-NEXT:    ALU clause starting at 14:
1373; CM-NEXT:     MOV * T4.X, 0.0,
1374; CM-NEXT:    ALU clause starting at 15:
1375; CM-NEXT:     AND_INT * T0.W, T5.X, literal.x,
1376; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1377; CM-NEXT:     AND_INT T0.Y, T4.X, literal.x,
1378; CM-NEXT:     LSHL T0.Z, PV.W, literal.y,
1379; CM-NEXT:     LSHL * T0.W, T7.X, literal.z, BS:VEC_120/SCL_212
1380; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
1381; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
1382; CM-NEXT:     OR_INT T0.Z, PV.W, PV.Z,
1383; CM-NEXT:     LSHL * T0.W, PV.Y, literal.x,
1384; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1385; CM-NEXT:     OR_INT T0.Z, PV.Z, PV.W,
1386; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
1387; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1388; CM-NEXT:     OR_INT * T4.X, PV.Z, PV.W,
1389; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
1390; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1391entry:
1392  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
1393  ret void
1394}
1395
; Kernel storing a by-value <4 x i16> argument to a global (addrspace(1))
; pointer. This definition carries no nocapture/nounwind attributes, and the
; store has no explicit alignment. In the assertions the amdgcn targets move
; the vector as two dwords (s_load_dwordx2 followed by a dwordx2 store). The
; r600 targets read each 16-bit element in its own fetch clause, merge the
; halves with AND/LSHL/OR across several ALU clauses, and finish with a
; single cacheless store.
1396define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
1397; SI-LABEL: v4i16_arg:
1398; SI:       ; %bb.0: ; %entry
1399; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
1400; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1401; SI-NEXT:    s_mov_b32 s3, 0xf000
1402; SI-NEXT:    s_mov_b32 s2, -1
1403; SI-NEXT:    s_waitcnt lgkmcnt(0)
1404; SI-NEXT:    v_mov_b32_e32 v0, s4
1405; SI-NEXT:    v_mov_b32_e32 v1, s5
1406; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1407; SI-NEXT:    s_endpgm
1408;
1409; VI-LABEL: v4i16_arg:
1410; VI:       ; %bb.0: ; %entry
1411; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1412; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
1413; VI-NEXT:    s_waitcnt lgkmcnt(0)
1414; VI-NEXT:    v_mov_b32_e32 v0, s2
1415; VI-NEXT:    v_mov_b32_e32 v3, s1
1416; VI-NEXT:    v_mov_b32_e32 v1, s3
1417; VI-NEXT:    v_mov_b32_e32 v2, s0
1418; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1419; VI-NEXT:    s_endpgm
1420;
1421; GFX9-LABEL: v4i16_arg:
1422; GFX9:       ; %bb.0: ; %entry
1423; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1424; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1425; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1426; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1427; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1428; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1429; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1430; GFX9-NEXT:    s_endpgm
1431;
1432; EG-LABEL: v4i16_arg:
1433; EG:       ; %bb.0: ; %entry
1434; EG-NEXT:    ALU 1, @20, KC0[], KC1[]
1435; EG-NEXT:    TEX 0 @12
1436; EG-NEXT:    ALU 5, @22, KC0[], KC1[]
1437; EG-NEXT:    TEX 0 @14
1438; EG-NEXT:    ALU 5, @28, KC0[], KC1[]
1439; EG-NEXT:    TEX 0 @16
1440; EG-NEXT:    ALU 5, @34, KC0[], KC1[]
1441; EG-NEXT:    TEX 0 @18
1442; EG-NEXT:    ALU 7, @40, KC0[CB0:0-32], KC1[]
1443; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
1444; EG-NEXT:    CF_END
1445; EG-NEXT:    PAD
1446; EG-NEXT:    Fetch clause starting at 12:
1447; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 50, #3
1448; EG-NEXT:    Fetch clause starting at 14:
1449; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 48, #3
1450; EG-NEXT:    Fetch clause starting at 16:
1451; EG-NEXT:     VTX_READ_16 T6.X, T5.X, 46, #3
1452; EG-NEXT:    Fetch clause starting at 18:
1453; EG-NEXT:     VTX_READ_16 T5.X, T5.X, 44, #3
1454; EG-NEXT:    ALU clause starting at 20:
1455; EG-NEXT:     MOV * T0.Y, T3.X,
1456; EG-NEXT:     MOV * T5.X, 0.0,
1457; EG-NEXT:    ALU clause starting at 22:
1458; EG-NEXT:     LSHL T0.W, T6.X, literal.x,
1459; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
1460; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
1461; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
1462; EG-NEXT:     MOV * T3.X, PV.W,
1463; EG-NEXT:     MOV * T0.Y, PV.X,
1464; EG-NEXT:    ALU clause starting at 28:
1465; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
1466; EG-NEXT:     AND_INT * T1.W, T6.X, literal.y,
1467; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
1468; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
1469; EG-NEXT:     MOV T3.X, PV.W,
1470; EG-NEXT:     MOV * T0.Y, T2.X,
1471; EG-NEXT:    ALU clause starting at 34:
1472; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
1473; EG-NEXT:     LSHL * T1.W, T6.X, literal.y,
1474; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
1475; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
1476; EG-NEXT:     MOV * T2.X, PV.W,
1477; EG-NEXT:     MOV * T0.Y, PV.X,
1478; EG-NEXT:    ALU clause starting at 40:
1479; EG-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
1480; EG-NEXT:     AND_INT T0.W, T0.Y, literal.y,
1481; EG-NEXT:     AND_INT * T1.W, T5.X, literal.z,
1482; EG-NEXT:    2(2.802597e-45), -65536(nan)
1483; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1484; EG-NEXT:     OR_INT * T5.X, PV.W, PS,
1485; EG-NEXT:     MOV T2.X, PV.X,
1486; EG-NEXT:     MOV * T5.Y, T3.X,
1487;
1488; CM-LABEL: v4i16_arg:
1489; CM:       ; %bb.0: ; %entry
1490; CM-NEXT:    ALU 1, @20, KC0[], KC1[]
1491; CM-NEXT:    TEX 0 @12
1492; CM-NEXT:    ALU 5, @22, KC0[], KC1[]
1493; CM-NEXT:    TEX 0 @14
1494; CM-NEXT:    ALU 5, @28, KC0[], KC1[]
1495; CM-NEXT:    TEX 0 @16
1496; CM-NEXT:    ALU 5, @34, KC0[], KC1[]
1497; CM-NEXT:    TEX 0 @18
1498; CM-NEXT:    ALU 7, @40, KC0[CB0:0-32], KC1[]
1499; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
1500; CM-NEXT:    CF_END
1501; CM-NEXT:    PAD
1502; CM-NEXT:    Fetch clause starting at 12:
1503; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 50, #3
1504; CM-NEXT:    Fetch clause starting at 14:
1505; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 48, #3
1506; CM-NEXT:    Fetch clause starting at 16:
1507; CM-NEXT:     VTX_READ_16 T6.X, T5.X, 46, #3
1508; CM-NEXT:    Fetch clause starting at 18:
1509; CM-NEXT:     VTX_READ_16 T5.X, T5.X, 44, #3
1510; CM-NEXT:    ALU clause starting at 20:
1511; CM-NEXT:     MOV * T0.Y, T3.X,
1512; CM-NEXT:     MOV * T5.X, 0.0,
1513; CM-NEXT:    ALU clause starting at 22:
1514; CM-NEXT:     LSHL T0.Z, T6.X, literal.x,
1515; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
1516; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
1517; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
1518; CM-NEXT:     MOV * T3.X, PV.W,
1519; CM-NEXT:     MOV * T0.Y, PV.X,
1520; CM-NEXT:    ALU clause starting at 28:
1521; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
1522; CM-NEXT:     AND_INT * T0.W, T6.X, literal.y,
1523; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
1524; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
1525; CM-NEXT:     MOV T3.X, PV.W,
1526; CM-NEXT:     MOV * T0.Y, T2.X,
1527; CM-NEXT:    ALU clause starting at 34:
1528; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
1529; CM-NEXT:     LSHL * T0.W, T6.X, literal.y,
1530; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
1531; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
1532; CM-NEXT:     MOV * T2.X, PV.W,
1533; CM-NEXT:     MOV * T0.Y, PV.X,
1534; CM-NEXT:    ALU clause starting at 40:
1535; CM-NEXT:     LSHR T6.X, KC0[2].Y, literal.x,
1536; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
1537; CM-NEXT:     AND_INT * T0.W, T5.X, literal.z,
1538; CM-NEXT:    2(2.802597e-45), -65536(nan)
1539; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1540; CM-NEXT:     OR_INT * T5.X, PV.Z, PV.W,
1541; CM-NEXT:     MOV T2.X, PV.X,
1542; CM-NEXT:     MOV * T5.Y, T3.X,
1543entry:
1544  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
1545  ret void
1546}
1547
; Kernel storing a by-value <4 x i32> argument to a global (addrspace(1))
; pointer with 4-byte alignment. In the assertions the amdgcn targets load the
; vector with one s_load_dwordx4 and write it with one dwordx4 store. The r600
; targets copy the four elements from KC0[3].Y through KC0[4].X with MOVs and
; emit a single cacheless store.
1548define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
1549; SI-LABEL: v4i32_arg:
1550; SI:       ; %bb.0: ; %entry
1551; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1552; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1553; SI-NEXT:    s_mov_b32 s3, 0xf000
1554; SI-NEXT:    s_mov_b32 s2, -1
1555; SI-NEXT:    s_waitcnt lgkmcnt(0)
1556; SI-NEXT:    v_mov_b32_e32 v0, s4
1557; SI-NEXT:    v_mov_b32_e32 v1, s5
1558; SI-NEXT:    v_mov_b32_e32 v2, s6
1559; SI-NEXT:    v_mov_b32_e32 v3, s7
1560; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1561; SI-NEXT:    s_endpgm
1562;
1563; VI-LABEL: v4i32_arg:
1564; VI:       ; %bb.0: ; %entry
1565; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1566; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
1567; VI-NEXT:    s_waitcnt lgkmcnt(0)
1568; VI-NEXT:    v_mov_b32_e32 v4, s4
1569; VI-NEXT:    v_mov_b32_e32 v0, s0
1570; VI-NEXT:    v_mov_b32_e32 v5, s5
1571; VI-NEXT:    v_mov_b32_e32 v1, s1
1572; VI-NEXT:    v_mov_b32_e32 v2, s2
1573; VI-NEXT:    v_mov_b32_e32 v3, s3
1574; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1575; VI-NEXT:    s_endpgm
1576;
1577; GFX9-LABEL: v4i32_arg:
1578; GFX9:       ; %bb.0: ; %entry
1579; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1580; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1581; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1582; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1583; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1584; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1585; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1586; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1587; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
1588; GFX9-NEXT:    s_endpgm
1589;
1590; EG-LABEL: v4i32_arg:
1591; EG:       ; %bb.0: ; %entry
1592; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1593; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1594; EG-NEXT:    CF_END
1595; EG-NEXT:    PAD
1596; EG-NEXT:    ALU clause starting at 4:
1597; EG-NEXT:     MOV * T0.W, KC0[4].X,
1598; EG-NEXT:     MOV * T0.Z, KC0[3].W,
1599; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1600; EG-NEXT:     MOV T0.X, KC0[3].Y,
1601; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1602; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1603;
1604; CM-LABEL: v4i32_arg:
1605; CM:       ; %bb.0: ; %entry
1606; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1607; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1608; CM-NEXT:    CF_END
1609; CM-NEXT:    PAD
1610; CM-NEXT:    ALU clause starting at 4:
1611; CM-NEXT:     MOV * T0.W, KC0[4].X,
1612; CM-NEXT:     MOV * T0.Z, KC0[3].W,
1613; CM-NEXT:     MOV * T0.Y, KC0[3].Z,
1614; CM-NEXT:     MOV * T0.X, KC0[3].Y,
1615; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1616; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1617entry:
1618  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
1619  ret void
1620}
1621
; v4f32_arg stores its by-value <4 x float> kernel argument unchanged to %out
; (an addrspace(1) pointer, align 4).
; The expected SI, VI and GFX9 code reads the vector with one s_load_dwordx4
; and writes it with a single dwordx4 store. The expected EG and CM code
; copies the four elements from KC0 and issues one MEM_RAT_CACHELESS store.
; The check lines below are autogenerated (see the first line of this file);
; regenerate them with that script rather than editing them by hand.
1622define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
1623; SI-LABEL: v4f32_arg:
1624; SI:       ; %bb.0: ; %entry
1625; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1626; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1627; SI-NEXT:    s_mov_b32 s3, 0xf000
1628; SI-NEXT:    s_mov_b32 s2, -1
1629; SI-NEXT:    s_waitcnt lgkmcnt(0)
1630; SI-NEXT:    v_mov_b32_e32 v0, s4
1631; SI-NEXT:    v_mov_b32_e32 v1, s5
1632; SI-NEXT:    v_mov_b32_e32 v2, s6
1633; SI-NEXT:    v_mov_b32_e32 v3, s7
1634; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1635; SI-NEXT:    s_endpgm
1636;
1637; VI-LABEL: v4f32_arg:
1638; VI:       ; %bb.0: ; %entry
1639; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
1640; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
1641; VI-NEXT:    s_waitcnt lgkmcnt(0)
1642; VI-NEXT:    v_mov_b32_e32 v4, s4
1643; VI-NEXT:    v_mov_b32_e32 v0, s0
1644; VI-NEXT:    v_mov_b32_e32 v5, s5
1645; VI-NEXT:    v_mov_b32_e32 v1, s1
1646; VI-NEXT:    v_mov_b32_e32 v2, s2
1647; VI-NEXT:    v_mov_b32_e32 v3, s3
1648; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
1649; VI-NEXT:    s_endpgm
1650;
1651; GFX9-LABEL: v4f32_arg:
1652; GFX9:       ; %bb.0: ; %entry
1653; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
1654; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
1655; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1656; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1657; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1658; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1659; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1660; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1661; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
1662; GFX9-NEXT:    s_endpgm
1663;
1664; EG-LABEL: v4f32_arg:
1665; EG:       ; %bb.0: ; %entry
1666; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1667; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
1668; EG-NEXT:    CF_END
1669; EG-NEXT:    PAD
1670; EG-NEXT:    ALU clause starting at 4:
1671; EG-NEXT:     MOV * T0.W, KC0[4].X,
1672; EG-NEXT:     MOV * T0.Z, KC0[3].W,
1673; EG-NEXT:     MOV * T0.Y, KC0[3].Z,
1674; EG-NEXT:     MOV T0.X, KC0[3].Y,
1675; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1676; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1677;
1678; CM-LABEL: v4f32_arg:
1679; CM:       ; %bb.0: ; %entry
1680; CM-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
1681; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
1682; CM-NEXT:    CF_END
1683; CM-NEXT:    PAD
1684; CM-NEXT:    ALU clause starting at 4:
1685; CM-NEXT:     MOV * T0.W, KC0[4].X,
1686; CM-NEXT:     MOV * T0.Z, KC0[3].W,
1687; CM-NEXT:     MOV * T0.Y, KC0[3].Z,
1688; CM-NEXT:     MOV * T0.X, KC0[3].Y,
1689; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1690; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1691entry:
1692  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
1693  ret void
1694}
1695
; v5i8_arg stores its by-value <5 x i8> kernel argument to %out (an
; addrspace(1) pointer, align 4).
; The expected SI, VI and GFX9 code splits the 5-byte store in two: a byte
; store at offset 4 and a dword store at offset 0.
; The expected EG and CM code reads the five bytes with VTX_READ_8, packs the
; first four into one register for a MEM_RAT_CACHELESS store, and writes the
; fifth byte with a MEM_RAT MSKOR store.
; The check lines below are autogenerated (see the first line of this file);
; regenerate them with that script rather than editing them by hand.
1696define amdgpu_kernel void @v5i8_arg(<5 x i8> addrspace(1)* nocapture %out, <5 x i8> %in) nounwind {
1697; SI-LABEL: v5i8_arg:
1698; SI:       ; %bb.0: ; %entry
1699; SI-NEXT:    s_load_dword s2, s[0:1], 0xc
1700; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1701; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
1702; SI-NEXT:    s_mov_b32 s7, 0xf000
1703; SI-NEXT:    s_mov_b32 s6, -1
1704; SI-NEXT:    s_waitcnt lgkmcnt(0)
1705; SI-NEXT:    v_mov_b32_e32 v0, s2
1706; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:4
1707; SI-NEXT:    s_waitcnt expcnt(0)
1708; SI-NEXT:    v_mov_b32_e32 v0, s0
1709; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1710; SI-NEXT:    s_endpgm
1711;
1712; VI-LABEL: v5i8_arg:
1713; VI:       ; %bb.0: ; %entry
1714; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1715; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
1716; VI-NEXT:    s_load_dword s1, s[0:1], 0x30
1717; VI-NEXT:    s_waitcnt lgkmcnt(0)
1718; VI-NEXT:    s_add_u32 s0, s2, 4
1719; VI-NEXT:    v_mov_b32_e32 v0, s2
1720; VI-NEXT:    v_mov_b32_e32 v4, s1
1721; VI-NEXT:    s_addc_u32 s1, s3, 0
1722; VI-NEXT:    v_mov_b32_e32 v3, s1
1723; VI-NEXT:    v_mov_b32_e32 v2, s0
1724; VI-NEXT:    flat_store_byte v[2:3], v4
1725; VI-NEXT:    v_mov_b32_e32 v1, s3
1726; VI-NEXT:    v_mov_b32_e32 v2, s4
1727; VI-NEXT:    flat_store_dword v[0:1], v2
1728; VI-NEXT:    s_endpgm
1729;
1730; GFX9-LABEL: v5i8_arg:
1731; GFX9:       ; %bb.0: ; %entry
1732; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1733; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
1734; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1735; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1736; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1737; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1738; GFX9-NEXT:    global_store_byte v0, v1, s[0:1] offset:4
1739; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
1740; GFX9-NEXT:    s_endpgm
1741;
1742; EG-LABEL: v5i8_arg:
1743; EG:       ; %bb.0: ; %entry
1744; EG-NEXT:    ALU 0, @16, KC0[], KC1[]
1745; EG-NEXT:    TEX 4 @6
1746; EG-NEXT:    ALU 28, @17, KC0[CB0:0-32], KC1[]
1747; EG-NEXT:    MEM_RAT MSKOR T5.XW, T8.X
1748; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
1749; EG-NEXT:    CF_END
1750; EG-NEXT:    Fetch clause starting at 6:
1751; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 44, #3
1752; EG-NEXT:     VTX_READ_8 T7.X, T5.X, 47, #3
1753; EG-NEXT:     VTX_READ_8 T8.X, T5.X, 45, #3
1754; EG-NEXT:     VTX_READ_8 T9.X, T5.X, 46, #3
1755; EG-NEXT:     VTX_READ_8 T5.X, T5.X, 48, #3
1756; EG-NEXT:    ALU clause starting at 16:
1757; EG-NEXT:     MOV * T5.X, 0.0,
1758; EG-NEXT:    ALU clause starting at 17:
1759; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1760; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1761; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1762; EG-NEXT:     AND_INT * T2.W, T5.X, literal.y,
1763; EG-NEXT:    3(4.203895e-45), 255(3.573311e-43)
1764; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1765; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1766; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
1767; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
1768; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1769; EG-NEXT:     MOV T5.Y, 0.0,
1770; EG-NEXT:     MOV T5.Z, 0.0,
1771; EG-NEXT:     AND_INT T1.W, T9.X, literal.x,
1772; EG-NEXT:     AND_INT * T0.Z, T8.X, literal.x,
1773; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1774; EG-NEXT:     LSHL T1.W, PV.W, literal.x,
1775; EG-NEXT:     LSHL * T2.W, T7.X, literal.y,
1776; EG-NEXT:    16(2.242078e-44), 24(3.363116e-44)
1777; EG-NEXT:     OR_INT T1.W, PS, PV.W,
1778; EG-NEXT:     LSHL * T2.W, T0.Z, literal.x,
1779; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1780; EG-NEXT:     OR_INT T1.W, PV.W, PS,
1781; EG-NEXT:     AND_INT * T2.W, T6.X, literal.x,
1782; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1783; EG-NEXT:     OR_INT T6.X, PV.W, PS,
1784; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1785; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1786; EG-NEXT:     LSHR * T8.X, T0.W, literal.x,
1787; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1788;
1789; CM-LABEL: v5i8_arg:
1790; CM:       ; %bb.0: ; %entry
1791; CM-NEXT:    ALU 0, @16, KC0[], KC1[]
1792; CM-NEXT:    TEX 4 @6
1793; CM-NEXT:    ALU 28, @17, KC0[CB0:0-32], KC1[]
1794; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T8.X
1795; CM-NEXT:    MEM_RAT MSKOR T5.XW, T7.X
1796; CM-NEXT:    CF_END
1797; CM-NEXT:    Fetch clause starting at 6:
1798; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 44, #3
1799; CM-NEXT:     VTX_READ_8 T7.X, T5.X, 47, #3
1800; CM-NEXT:     VTX_READ_8 T8.X, T5.X, 45, #3
1801; CM-NEXT:     VTX_READ_8 T9.X, T5.X, 46, #3
1802; CM-NEXT:     VTX_READ_8 T5.X, T5.X, 48, #3
1803; CM-NEXT:    ALU clause starting at 16:
1804; CM-NEXT:     MOV * T5.X, 0.0,
1805; CM-NEXT:    ALU clause starting at 17:
1806; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1807; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1808; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
1809; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1810; CM-NEXT:     AND_INT T0.Z, T5.X, literal.x,
1811; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
1812; CM-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1813; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
1814; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
1815; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1816; CM-NEXT:     MOV T5.Y, 0.0,
1817; CM-NEXT:     MOV T5.Z, 0.0,
1818; CM-NEXT:     AND_INT * T1.W, T9.X, literal.x,
1819; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1820; CM-NEXT:     AND_INT T0.Y, T8.X, literal.x,
1821; CM-NEXT:     LSHL T0.Z, PV.W, literal.y,
1822; CM-NEXT:     LSHL * T1.W, T7.X, literal.z, BS:VEC_120/SCL_212
1823; CM-NEXT:    255(3.573311e-43), 16(2.242078e-44)
1824; CM-NEXT:    24(3.363116e-44), 0(0.000000e+00)
1825; CM-NEXT:     OR_INT T0.Z, PV.W, PV.Z,
1826; CM-NEXT:     LSHL * T1.W, PV.Y, literal.x,
1827; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1828; CM-NEXT:     LSHR T7.X, T0.W, literal.x,
1829; CM-NEXT:     OR_INT T0.Z, PV.Z, PV.W,
1830; CM-NEXT:     AND_INT * T0.W, T6.X, literal.y,
1831; CM-NEXT:    2(2.802597e-45), 255(3.573311e-43)
1832; CM-NEXT:     OR_INT * T6.X, PV.Z, PV.W,
1833; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
1834; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1835entry:
1836  store <5 x i8> %in, <5 x i8> addrspace(1)* %out, align 4
1837  ret void
1838}
1839
; v5i16_arg stores its by-value <5 x i16> kernel argument to %out (an
; addrspace(1) pointer, align 4).
; The expected SI, VI and GFX9 code splits the 10-byte store in two: a short
; store at offset 8 and a dwordx2 store at offset 0.
; The expected EG and CM code reads each element with VTX_READ_16 and writes
; each one separately with a MEM_RAT MSKOR store (five stores in total).
; The check lines below are autogenerated (see the first line of this file);
; regenerate them with that script rather than editing them by hand.
1840define amdgpu_kernel void @v5i16_arg(<5 x i16> addrspace(1)* nocapture %out, <5 x i16> %in) nounwind {
1841; SI-LABEL: v5i16_arg:
1842; SI:       ; %bb.0: ; %entry
1843; SI-NEXT:    s_load_dword s2, s[0:1], 0xf
1844; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1845; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1846; SI-NEXT:    s_mov_b32 s7, 0xf000
1847; SI-NEXT:    s_mov_b32 s6, -1
1848; SI-NEXT:    s_waitcnt lgkmcnt(0)
1849; SI-NEXT:    v_mov_b32_e32 v0, s2
1850; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:8
1851; SI-NEXT:    s_waitcnt expcnt(0)
1852; SI-NEXT:    v_mov_b32_e32 v0, s0
1853; SI-NEXT:    v_mov_b32_e32 v1, s1
1854; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1855; SI-NEXT:    s_endpgm
1856;
1857; VI-LABEL: v5i16_arg:
1858; VI:       ; %bb.0: ; %entry
1859; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1860; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x34
1861; VI-NEXT:    s_load_dword s1, s[0:1], 0x3c
1862; VI-NEXT:    s_waitcnt lgkmcnt(0)
1863; VI-NEXT:    s_add_u32 s0, s2, 8
1864; VI-NEXT:    v_mov_b32_e32 v0, s2
1865; VI-NEXT:    v_mov_b32_e32 v4, s1
1866; VI-NEXT:    s_addc_u32 s1, s3, 0
1867; VI-NEXT:    v_mov_b32_e32 v3, s1
1868; VI-NEXT:    v_mov_b32_e32 v2, s0
1869; VI-NEXT:    flat_store_short v[2:3], v4
1870; VI-NEXT:    v_mov_b32_e32 v2, s4
1871; VI-NEXT:    v_mov_b32_e32 v1, s3
1872; VI-NEXT:    v_mov_b32_e32 v3, s5
1873; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
1874; VI-NEXT:    s_endpgm
1875;
1876; GFX9-LABEL: v5i16_arg:
1877; GFX9:       ; %bb.0: ; %entry
1878; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1879; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x10
1880; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x18
1881; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1882; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1883; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1884; GFX9-NEXT:    v_mov_b32_e32 v3, s6
1885; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1886; GFX9-NEXT:    global_store_short v2, v3, s[0:1] offset:8
1887; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1888; GFX9-NEXT:    s_endpgm
1889;
1890; EG-LABEL: v5i16_arg:
1891; EG:       ; %bb.0: ; %entry
1892; EG-NEXT:    ALU 0, @20, KC0[], KC1[]
1893; EG-NEXT:    TEX 4 @10
1894; EG-NEXT:    ALU 65, @21, KC0[CB0:0-32], KC1[]
1895; EG-NEXT:    MEM_RAT MSKOR T5.XW, T9.X
1896; EG-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
1897; EG-NEXT:    MEM_RAT MSKOR T3.XW, T2.X
1898; EG-NEXT:    MEM_RAT MSKOR T6.XW, T1.X
1899; EG-NEXT:    MEM_RAT MSKOR T8.XW, T0.X
1900; EG-NEXT:    CF_END
1901; EG-NEXT:    PAD
1902; EG-NEXT:    Fetch clause starting at 10:
1903; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 58, #3
1904; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 56, #3
1905; EG-NEXT:     VTX_READ_16 T3.X, T0.X, 54, #3
1906; EG-NEXT:     VTX_READ_16 T4.X, T0.X, 52, #3
1907; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 60, #3
1908; EG-NEXT:    ALU clause starting at 20:
1909; EG-NEXT:     MOV * T0.X, 0.0,
1910; EG-NEXT:    ALU clause starting at 21:
1911; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1912; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
1913; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1914; EG-NEXT:     AND_INT * T2.W, T0.X, literal.y,
1915; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1916; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1917; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1918; EG-NEXT:     LSHL T5.X, T2.W, PV.W,
1919; EG-NEXT:     LSHL * T5.W, literal.x, PV.W,
1920; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1921; EG-NEXT:     MOV T5.Y, 0.0,
1922; EG-NEXT:     AND_INT T1.W, KC0[2].Y, literal.x,
1923; EG-NEXT:     AND_INT * T2.W, T4.X, literal.y,
1924; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1925; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
1926; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1927; EG-NEXT:     LSHL T4.X, T2.W, PV.W,
1928; EG-NEXT:     LSHL * T4.W, literal.x, PV.W,
1929; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1930; EG-NEXT:     MOV T4.Y, 0.0,
1931; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
1932; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1933; EG-NEXT:     AND_INT T2.W, PV.W, literal.x,
1934; EG-NEXT:     AND_INT * T3.W, T3.X, literal.y,
1935; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1936; EG-NEXT:     LSHL * T2.W, PV.W, literal.x,
1937; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1938; EG-NEXT:     LSHL T3.X, T3.W, PV.W,
1939; EG-NEXT:     LSHL * T3.W, literal.x, PV.W,
1940; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1941; EG-NEXT:     MOV T3.Y, 0.0,
1942; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
1943; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
1944; EG-NEXT:     AND_INT T6.W, PV.W, literal.x,
1945; EG-NEXT:     AND_INT * T7.W, T2.X, literal.y,
1946; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1947; EG-NEXT:     LSHL * T6.W, PV.W, literal.x,
1948; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1949; EG-NEXT:     LSHL T6.X, T7.W, PV.W,
1950; EG-NEXT:     LSHL * T6.W, literal.x, PV.W,
1951; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1952; EG-NEXT:     MOV T6.Y, 0.0,
1953; EG-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.x,
1954; EG-NEXT:    6(8.407791e-45), 0(0.000000e+00)
1955; EG-NEXT:     AND_INT T8.W, PV.W, literal.x,
1956; EG-NEXT:     AND_INT * T9.W, T1.X, literal.y,
1957; EG-NEXT:    3(4.203895e-45), 65535(9.183409e-41)
1958; EG-NEXT:     LSHL * T8.W, PV.W, literal.x,
1959; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1960; EG-NEXT:     LSHL T8.X, T9.W, PV.W,
1961; EG-NEXT:     LSHL * T8.W, literal.x, PV.W,
1962; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1963; EG-NEXT:     MOV T8.Y, 0.0,
1964; EG-NEXT:     MOV T5.Z, 0.0,
1965; EG-NEXT:     MOV * T4.Z, 0.0,
1966; EG-NEXT:     MOV T3.Z, 0.0,
1967; EG-NEXT:     MOV * T6.Z, 0.0,
1968; EG-NEXT:     MOV * T8.Z, 0.0,
1969; EG-NEXT:     LSHR T0.X, T7.W, literal.x,
1970; EG-NEXT:     LSHR * T1.X, T2.W, literal.x,
1971; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1972; EG-NEXT:     LSHR T2.X, T1.W, literal.x,
1973; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
1974; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1975; EG-NEXT:     LSHR * T9.X, T0.W, literal.x,
1976; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1977;
1978; CM-LABEL: v5i16_arg:
1979; CM:       ; %bb.0: ; %entry
1980; CM-NEXT:    ALU 0, @20, KC0[], KC1[]
1981; CM-NEXT:    TEX 4 @10
1982; CM-NEXT:    ALU 67, @21, KC0[CB0:0-32], KC1[]
1983; CM-NEXT:    MEM_RAT MSKOR T5.XW, T9.X
1984; CM-NEXT:    MEM_RAT MSKOR T4.XW, T7.X
1985; CM-NEXT:    MEM_RAT MSKOR T3.XW, T2.X
1986; CM-NEXT:    MEM_RAT MSKOR T6.XW, T1.X
1987; CM-NEXT:    MEM_RAT MSKOR T8.XW, T0.X
1988; CM-NEXT:    CF_END
1989; CM-NEXT:    PAD
1990; CM-NEXT:    Fetch clause starting at 10:
1991; CM-NEXT:     VTX_READ_16 T1.X, T0.X, 58, #3
1992; CM-NEXT:     VTX_READ_16 T2.X, T0.X, 56, #3
1993; CM-NEXT:     VTX_READ_16 T3.X, T0.X, 54, #3
1994; CM-NEXT:     VTX_READ_16 T4.X, T0.X, 52, #3
1995; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 60, #3
1996; CM-NEXT:    ALU clause starting at 20:
1997; CM-NEXT:     MOV * T0.X, 0.0,
1998; CM-NEXT:    ALU clause starting at 21:
1999; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
2000; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2001; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
2002; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2003; CM-NEXT:     AND_INT T0.Z, T0.X, literal.x,
2004; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
2005; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2006; CM-NEXT:     LSHL T5.X, PV.Z, PV.W,
2007; CM-NEXT:     LSHL * T5.W, literal.x, PV.W,
2008; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2009; CM-NEXT:     MOV T5.Y, 0.0,
2010; CM-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
2011; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2012; CM-NEXT:     AND_INT T0.Z, T4.X, literal.x,
2013; CM-NEXT:     LSHL * T1.W, PV.W, literal.y,
2014; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2015; CM-NEXT:     LSHL T4.X, PV.Z, PV.W,
2016; CM-NEXT:     LSHL * T4.W, literal.x, PV.W,
2017; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2018; CM-NEXT:     MOV T4.Y, 0.0,
2019; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
2020; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2021; CM-NEXT:     AND_INT * T2.W, PV.W, literal.x,
2022; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2023; CM-NEXT:     AND_INT T0.Z, T3.X, literal.x,
2024; CM-NEXT:     LSHL * T2.W, PV.W, literal.y,
2025; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2026; CM-NEXT:     LSHL T3.X, PV.Z, PV.W,
2027; CM-NEXT:     LSHL * T3.W, literal.x, PV.W,
2028; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2029; CM-NEXT:     MOV T3.Y, 0.0,
2030; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2031; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
2032; CM-NEXT:     AND_INT * T6.W, PV.W, literal.x,
2033; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2034; CM-NEXT:     AND_INT T0.Z, T2.X, literal.x,
2035; CM-NEXT:     LSHL * T6.W, PV.W, literal.y,
2036; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2037; CM-NEXT:     LSHL T6.X, PV.Z, PV.W,
2038; CM-NEXT:     LSHL * T6.W, literal.x, PV.W,
2039; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2040; CM-NEXT:     MOV T6.Y, 0.0,
2041; CM-NEXT:     ADD_INT * T7.W, KC0[2].Y, literal.x,
2042; CM-NEXT:    6(8.407791e-45), 0(0.000000e+00)
2043; CM-NEXT:     AND_INT * T8.W, PV.W, literal.x,
2044; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2045; CM-NEXT:     AND_INT T0.Z, T1.X, literal.x,
2046; CM-NEXT:     LSHL * T8.W, PV.W, literal.y,
2047; CM-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2048; CM-NEXT:     LSHL T8.X, PV.Z, PV.W,
2049; CM-NEXT:     LSHL * T8.W, literal.x, PV.W,
2050; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2051; CM-NEXT:     MOV T8.Y, 0.0,
2052; CM-NEXT:     MOV * T5.Z, 0.0,
2053; CM-NEXT:     MOV * T4.Z, 0.0,
2054; CM-NEXT:     MOV * T3.Z, 0.0,
2055; CM-NEXT:     MOV * T6.Z, 0.0,
2056; CM-NEXT:     MOV * T8.Z, 0.0,
2057; CM-NEXT:     LSHR * T0.X, T7.W, literal.x,
2058; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2059; CM-NEXT:     LSHR * T1.X, T2.W, literal.x,
2060; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2061; CM-NEXT:     LSHR * T2.X, T1.W, literal.x,
2062; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2063; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
2064; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2065; CM-NEXT:     LSHR * T9.X, T0.W, literal.x,
2066; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2067entry:
2068  store <5 x i16> %in, <5 x i16> addrspace(1)* %out, align 4
2069  ret void
2070}
2071
; v5i32_arg stores its by-value <5 x i32> kernel argument to %out (an
; addrspace(1) pointer, align 4).
; The expected SI, VI and GFX9 code loads the vector as a dwordx4 plus a
; single dword, then stores the fifth element at offset 16 followed by a
; dwordx4 store of the first four elements at offset 0.
; The expected EG and CM code issues two MEM_RAT_CACHELESS stores, one for the
; first four elements and one for the fifth at byte offset 16.
; The check lines below are autogenerated (see the first line of this file);
; regenerate them with that script rather than editing them by hand.
2072define amdgpu_kernel void @v5i32_arg(<5 x i32> addrspace(1)* nocapture %out, <5 x i32> %in) nounwind {
2073; SI-LABEL: v5i32_arg:
2074; SI:       ; %bb.0: ; %entry
2075; SI-NEXT:    s_load_dword s8, s[0:1], 0x15
2076; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2077; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
2078; SI-NEXT:    s_mov_b32 s7, 0xf000
2079; SI-NEXT:    s_mov_b32 s6, -1
2080; SI-NEXT:    s_waitcnt lgkmcnt(0)
2081; SI-NEXT:    v_mov_b32_e32 v0, s8
2082; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:16
2083; SI-NEXT:    s_waitcnt expcnt(0)
2084; SI-NEXT:    v_mov_b32_e32 v0, s0
2085; SI-NEXT:    v_mov_b32_e32 v1, s1
2086; SI-NEXT:    v_mov_b32_e32 v2, s2
2087; SI-NEXT:    v_mov_b32_e32 v3, s3
2088; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2089; SI-NEXT:    s_endpgm
2090;
2091; VI-LABEL: v5i32_arg:
2092; VI:       ; %bb.0: ; %entry
2093; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2094; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
2095; VI-NEXT:    s_load_dword s1, s[0:1], 0x54
2096; VI-NEXT:    s_waitcnt lgkmcnt(0)
2097; VI-NEXT:    s_add_u32 s0, s2, 16
2098; VI-NEXT:    v_mov_b32_e32 v5, s3
2099; VI-NEXT:    v_mov_b32_e32 v2, s1
2100; VI-NEXT:    s_addc_u32 s1, s3, 0
2101; VI-NEXT:    v_mov_b32_e32 v0, s0
2102; VI-NEXT:    v_mov_b32_e32 v1, s1
2103; VI-NEXT:    flat_store_dword v[0:1], v2
2104; VI-NEXT:    v_mov_b32_e32 v0, s4
2105; VI-NEXT:    v_mov_b32_e32 v4, s2
2106; VI-NEXT:    v_mov_b32_e32 v1, s5
2107; VI-NEXT:    v_mov_b32_e32 v2, s6
2108; VI-NEXT:    v_mov_b32_e32 v3, s7
2109; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2110; VI-NEXT:    s_endpgm
2111;
2112; GFX9-LABEL: v5i32_arg:
2113; GFX9:       ; %bb.0: ; %entry
2114; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2115; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
2116; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x30
2117; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2118; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2119; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2120; GFX9-NEXT:    v_mov_b32_e32 v5, s8
2121; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2122; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2123; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2124; GFX9-NEXT:    global_store_dword v4, v5, s[6:7] offset:16
2125; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2126; GFX9-NEXT:    s_endpgm
2127;
2128; EG-LABEL: v5i32_arg:
2129; EG:       ; %bb.0: ; %entry
2130; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2131; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
2132; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2133; EG-NEXT:    CF_END
2134; EG-NEXT:    ALU clause starting at 4:
2135; EG-NEXT:     MOV * T0.W, KC0[5].X,
2136; EG-NEXT:     MOV * T0.Z, KC0[4].W,
2137; EG-NEXT:     MOV * T0.Y, KC0[4].Z,
2138; EG-NEXT:     MOV T0.X, KC0[4].Y,
2139; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2140; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2141; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
2142; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2143; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
2144; EG-NEXT:     MOV * T3.X, KC0[5].Y,
2145; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2146;
2147; CM-LABEL: v5i32_arg:
2148; CM:       ; %bb.0: ; %entry
2149; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2150; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
2151; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
2152; CM-NEXT:    CF_END
2153; CM-NEXT:    ALU clause starting at 4:
2154; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
2155; CM-NEXT:     MOV * T0.W, KC0[5].X,
2156; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2157; CM-NEXT:     LSHR T1.X, PV.Z, literal.x,
2158; CM-NEXT:     MOV * T0.Z, KC0[4].W,
2159; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2160; CM-NEXT:     MOV T2.X, KC0[5].Y,
2161; CM-NEXT:     MOV * T0.Y, KC0[4].Z,
2162; CM-NEXT:     MOV * T0.X, KC0[4].Y,
2163; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2164; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2165entry:
2166  store <5 x i32> %in, <5 x i32> addrspace(1)* %out, align 4
2167  ret void
2168}
2169
; v5f32_arg stores its by-value <5 x float> kernel argument to %out (an
; addrspace(1) pointer, align 4).
; The expected SI, VI and GFX9 code loads the vector as a dwordx4 plus a
; single dword and writes it back as a dwordx4 store at offset 0 plus a dword
; store at offset 16. In the expected GFX9 code the dwordx4 store comes first
; and an s_nop 0 separates it from the reuse of v0 for the fifth element.
; The expected EG and CM code issues two MEM_RAT_CACHELESS stores, one for the
; first four elements and one for the fifth at byte offset 16.
; The check lines below are autogenerated (see the first line of this file);
; regenerate them with that script rather than editing them by hand.
2170define amdgpu_kernel void @v5f32_arg(<5 x float> addrspace(1)* nocapture %out, <5 x float> %in) nounwind {
2171; SI-LABEL: v5f32_arg:
2172; SI:       ; %bb.0: ; %entry
2173; SI-NEXT:    s_load_dword s8, s[0:1], 0x15
2174; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
2175; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x11
2176; SI-NEXT:    s_mov_b32 s7, 0xf000
2177; SI-NEXT:    s_mov_b32 s6, -1
2178; SI-NEXT:    s_waitcnt lgkmcnt(0)
2179; SI-NEXT:    v_mov_b32_e32 v0, s8
2180; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:16
2181; SI-NEXT:    s_waitcnt expcnt(0)
2182; SI-NEXT:    v_mov_b32_e32 v0, s0
2183; SI-NEXT:    v_mov_b32_e32 v1, s1
2184; SI-NEXT:    v_mov_b32_e32 v2, s2
2185; SI-NEXT:    v_mov_b32_e32 v3, s3
2186; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
2187; SI-NEXT:    s_endpgm
2188;
2189; VI-LABEL: v5f32_arg:
2190; VI:       ; %bb.0: ; %entry
2191; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2192; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x44
2193; VI-NEXT:    s_load_dword s1, s[0:1], 0x54
2194; VI-NEXT:    s_waitcnt lgkmcnt(0)
2195; VI-NEXT:    s_add_u32 s0, s2, 16
2196; VI-NEXT:    v_mov_b32_e32 v5, s3
2197; VI-NEXT:    v_mov_b32_e32 v3, s1
2198; VI-NEXT:    s_addc_u32 s1, s3, 0
2199; VI-NEXT:    v_mov_b32_e32 v2, s1
2200; VI-NEXT:    v_mov_b32_e32 v1, s0
2201; VI-NEXT:    flat_store_dword v[1:2], v3
2202; VI-NEXT:    v_mov_b32_e32 v0, s4
2203; VI-NEXT:    v_mov_b32_e32 v1, s5
2204; VI-NEXT:    v_mov_b32_e32 v2, s6
2205; VI-NEXT:    v_mov_b32_e32 v3, s7
2206; VI-NEXT:    v_mov_b32_e32 v4, s2
2207; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2208; VI-NEXT:    s_endpgm
2209;
2210; GFX9-LABEL: v5f32_arg:
2211; GFX9:       ; %bb.0: ; %entry
2212; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2213; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x20
2214; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x30
2215; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2216; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2217; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2218; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2219; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2220; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2221; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2222; GFX9-NEXT:    s_nop 0
2223; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2224; GFX9-NEXT:    global_store_dword v4, v0, s[6:7] offset:16
2225; GFX9-NEXT:    s_endpgm
2226;
2227; EG-LABEL: v5f32_arg:
2228; EG:       ; %bb.0: ; %entry
2229; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2230; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
2231; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
2232; EG-NEXT:    CF_END
2233; EG-NEXT:    ALU clause starting at 4:
2234; EG-NEXT:     MOV * T0.W, KC0[5].X,
2235; EG-NEXT:     MOV * T0.Z, KC0[4].W,
2236; EG-NEXT:     MOV * T0.Y, KC0[4].Z,
2237; EG-NEXT:     MOV T0.X, KC0[4].Y,
2238; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2239; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2240; EG-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.x,
2241; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2242; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
2243; EG-NEXT:     MOV * T3.X, KC0[5].Y,
2244; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2245;
2246; CM-LABEL: v5f32_arg:
2247; CM:       ; %bb.0: ; %entry
2248; CM-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
2249; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
2250; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
2251; CM-NEXT:    CF_END
2252; CM-NEXT:    ALU clause starting at 4:
2253; CM-NEXT:     ADD_INT T0.Z, KC0[2].Y, literal.x,
2254; CM-NEXT:     MOV * T0.W, KC0[5].X,
2255; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2256; CM-NEXT:     LSHR T1.X, PV.Z, literal.x,
2257; CM-NEXT:     MOV * T0.Z, KC0[4].W,
2258; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2259; CM-NEXT:     MOV T2.X, KC0[5].Y,
2260; CM-NEXT:     MOV * T0.Y, KC0[4].Z,
2261; CM-NEXT:     MOV * T0.X, KC0[4].Y,
2262; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2263; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2264entry:
2265  store <5 x float> %in, <5 x float> addrspace(1)* %out, align 4
2266  ret void
2267}
2268
; v5i64_arg stores its by-value <5 x i64> kernel argument to %out (an
; addrspace(1) pointer, align 8).
; The expected SI, VI and GFX9 code writes the 40 bytes with three stores: a
; dwordx4 at offset 16, a dwordx4 at offset 0 and a dwordx2 at offset 32.
; The expected EG and CM code copies the elements from KC0 and issues three
; MEM_RAT_CACHELESS stores to %out, %out + 16 and %out + 32.
; The check lines below are autogenerated (see the first line of this file);
; regenerate them with that script rather than editing them by hand.
2269define amdgpu_kernel void @v5i64_arg(<5 x i64> addrspace(1)* nocapture %out, <5 x i64> %in) nounwind {
2270; SI-LABEL: v5i64_arg:
2271; SI:       ; %bb.0: ; %entry
2272; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
2273; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
2274; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x21
2275; SI-NEXT:    s_mov_b32 s15, 0xf000
2276; SI-NEXT:    s_mov_b32 s14, -1
2277; SI-NEXT:    s_waitcnt lgkmcnt(0)
2278; SI-NEXT:    v_mov_b32_e32 v0, s8
2279; SI-NEXT:    v_mov_b32_e32 v1, s9
2280; SI-NEXT:    v_mov_b32_e32 v2, s10
2281; SI-NEXT:    v_mov_b32_e32 v3, s11
2282; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
2283; SI-NEXT:    s_waitcnt expcnt(0)
2284; SI-NEXT:    v_mov_b32_e32 v0, s4
2285; SI-NEXT:    v_mov_b32_e32 v1, s5
2286; SI-NEXT:    v_mov_b32_e32 v2, s6
2287; SI-NEXT:    v_mov_b32_e32 v3, s7
2288; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
2289; SI-NEXT:    s_waitcnt expcnt(0)
2290; SI-NEXT:    v_mov_b32_e32 v0, s0
2291; SI-NEXT:    v_mov_b32_e32 v1, s1
2292; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
2293; SI-NEXT:    s_endpgm
2294;
2295; VI-LABEL: v5i64_arg:
2296; VI:       ; %bb.0: ; %entry
2297; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2298; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
2299; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
2300; VI-NEXT:    s_waitcnt lgkmcnt(0)
2301; VI-NEXT:    v_mov_b32_e32 v0, s8
2302; VI-NEXT:    s_add_u32 s8, s2, 16
2303; VI-NEXT:    v_mov_b32_e32 v1, s9
2304; VI-NEXT:    s_addc_u32 s9, s3, 0
2305; VI-NEXT:    v_mov_b32_e32 v4, s8
2306; VI-NEXT:    v_mov_b32_e32 v2, s10
2307; VI-NEXT:    v_mov_b32_e32 v3, s11
2308; VI-NEXT:    v_mov_b32_e32 v5, s9
2309; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2310; VI-NEXT:    v_mov_b32_e32 v5, s3
2311; VI-NEXT:    v_mov_b32_e32 v4, s2
2312; VI-NEXT:    s_add_u32 s2, s2, 32
2313; VI-NEXT:    v_mov_b32_e32 v0, s4
2314; VI-NEXT:    v_mov_b32_e32 v1, s5
2315; VI-NEXT:    v_mov_b32_e32 v2, s6
2316; VI-NEXT:    v_mov_b32_e32 v3, s7
2317; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2318; VI-NEXT:    s_addc_u32 s3, s3, 0
2319; VI-NEXT:    v_mov_b32_e32 v2, s2
2320; VI-NEXT:    v_mov_b32_e32 v0, s0
2321; VI-NEXT:    v_mov_b32_e32 v1, s1
2322; VI-NEXT:    v_mov_b32_e32 v3, s3
2323; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2324; VI-NEXT:    s_endpgm
2325;
2326; GFX9-LABEL: v5i64_arg:
2327; GFX9:       ; %bb.0: ; %entry
2328; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
2329; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2330; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
2331; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2332; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2333; GFX9-NEXT:    v_mov_b32_e32 v0, s12
2334; GFX9-NEXT:    v_mov_b32_e32 v1, s13
2335; GFX9-NEXT:    v_mov_b32_e32 v2, s14
2336; GFX9-NEXT:    v_mov_b32_e32 v3, s15
2337; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
2338; GFX9-NEXT:    s_nop 0
2339; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2340; GFX9-NEXT:    v_mov_b32_e32 v1, s9
2341; GFX9-NEXT:    v_mov_b32_e32 v2, s10
2342; GFX9-NEXT:    v_mov_b32_e32 v3, s11
2343; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2344; GFX9-NEXT:    s_nop 0
2345; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2346; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2347; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
2348; GFX9-NEXT:    s_endpgm
2349;
2350; EG-LABEL: v5i64_arg:
2351; EG:       ; %bb.0: ; %entry
2352; EG-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2353; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2354; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2355; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2356; EG-NEXT:    CF_END
2357; EG-NEXT:    PAD
2358; EG-NEXT:    ALU clause starting at 6:
2359; EG-NEXT:     MOV * T0.W, KC0[7].X,
2360; EG-NEXT:     MOV * T0.Z, KC0[6].W,
2361; EG-NEXT:     MOV T0.Y, KC0[6].Z,
2362; EG-NEXT:     MOV * T1.W, KC0[8].X,
2363; EG-NEXT:     MOV T0.X, KC0[6].Y,
2364; EG-NEXT:     MOV * T1.Z, KC0[7].W,
2365; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2366; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
2367; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2368; EG-NEXT:     MOV T1.X, KC0[7].Y,
2369; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2370; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2371; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
2372; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
2373; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2374; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
2375; EG-NEXT:     MOV T5.Y, KC0[8].Z,
2376; EG-NEXT:     MOV * T5.X, KC0[8].Y,
2377; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2378;
2379; CM-LABEL: v5i64_arg:
2380; CM:       ; %bb.0: ; %entry
2381; CM-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2382; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2383; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2384; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2385; CM-NEXT:    CF_END
2386; CM-NEXT:    PAD
2387; CM-NEXT:    ALU clause starting at 6:
2388; CM-NEXT:     MOV * T0.W, KC0[8].X,
2389; CM-NEXT:     MOV T1.Y, KC0[8].Z,
2390; CM-NEXT:     MOV * T0.Z, KC0[7].W,
2391; CM-NEXT:     MOV T1.X, KC0[8].Y,
2392; CM-NEXT:     MOV * T0.Y, KC0[7].Z,
2393; CM-NEXT:     MOV T0.X, KC0[7].Y,
2394; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
2395; CM-NEXT:     MOV * T2.W, KC0[7].X,
2396; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
2397; CM-NEXT:     LSHR T3.X, PV.Z, literal.x,
2398; CM-NEXT:     MOV T2.Z, KC0[6].W,
2399; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
2400; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2401; CM-NEXT:     LSHR T4.X, PV.W, literal.x,
2402; CM-NEXT:     MOV * T2.Y, KC0[6].Z,
2403; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2404; CM-NEXT:     MOV * T2.X, KC0[6].Y,
2405; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
2406; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2407entry:
2408  store <5 x i64> %in, <5 x i64> addrspace(1)* %out, align 8
2409  ret void
2410}
2411
; Kernel-argument lowering test for a <5 x double> passed directly as a kernel
; argument: the 40-byte vector is stored unchanged to the global (addrspace 1)
; pointer %out with 8-byte alignment.
; The GCN check blocks below expect the value to be written as two dwordx4
; stores (offsets 0 and 16) followed by one dwordx2 store (offset 32); the EG
; and CM check blocks expect three MEM_RAT_CACHELESS stores.
2412define amdgpu_kernel void @v5f64_arg(<5 x double> addrspace(1)* nocapture %out, <5 x double> %in) nounwind {
2413; SI-LABEL: v5f64_arg:
2414; SI:       ; %bb.0: ; %entry
2415; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x19
2416; SI-NEXT:    s_load_dwordx2 s[12:13], s[0:1], 0x9
2417; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x21
2418; SI-NEXT:    s_mov_b32 s15, 0xf000
2419; SI-NEXT:    s_mov_b32 s14, -1
2420; SI-NEXT:    s_waitcnt lgkmcnt(0)
2421; SI-NEXT:    v_mov_b32_e32 v0, s8
2422; SI-NEXT:    v_mov_b32_e32 v1, s9
2423; SI-NEXT:    v_mov_b32_e32 v2, s10
2424; SI-NEXT:    v_mov_b32_e32 v3, s11
2425; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0 offset:16
2426; SI-NEXT:    s_waitcnt expcnt(0)
2427; SI-NEXT:    v_mov_b32_e32 v0, s4
2428; SI-NEXT:    v_mov_b32_e32 v1, s5
2429; SI-NEXT:    v_mov_b32_e32 v2, s6
2430; SI-NEXT:    v_mov_b32_e32 v3, s7
2431; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[12:15], 0
2432; SI-NEXT:    s_waitcnt expcnt(0)
2433; SI-NEXT:    v_mov_b32_e32 v0, s0
2434; SI-NEXT:    v_mov_b32_e32 v1, s1
2435; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0 offset:32
2436; SI-NEXT:    s_endpgm
2437;
2438; VI-LABEL: v5f64_arg:
2439; VI:       ; %bb.0: ; %entry
2440; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2441; VI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x64
2442; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x84
2443; VI-NEXT:    s_waitcnt lgkmcnt(0)
2444; VI-NEXT:    v_mov_b32_e32 v0, s8
2445; VI-NEXT:    s_add_u32 s8, s2, 16
2446; VI-NEXT:    v_mov_b32_e32 v1, s9
2447; VI-NEXT:    s_addc_u32 s9, s3, 0
2448; VI-NEXT:    v_mov_b32_e32 v4, s8
2449; VI-NEXT:    v_mov_b32_e32 v2, s10
2450; VI-NEXT:    v_mov_b32_e32 v3, s11
2451; VI-NEXT:    v_mov_b32_e32 v5, s9
2452; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2453; VI-NEXT:    v_mov_b32_e32 v5, s3
2454; VI-NEXT:    v_mov_b32_e32 v4, s2
2455; VI-NEXT:    s_add_u32 s2, s2, 32
2456; VI-NEXT:    v_mov_b32_e32 v0, s4
2457; VI-NEXT:    v_mov_b32_e32 v1, s5
2458; VI-NEXT:    v_mov_b32_e32 v2, s6
2459; VI-NEXT:    v_mov_b32_e32 v3, s7
2460; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2461; VI-NEXT:    s_addc_u32 s3, s3, 0
2462; VI-NEXT:    v_mov_b32_e32 v2, s2
2463; VI-NEXT:    v_mov_b32_e32 v0, s0
2464; VI-NEXT:    v_mov_b32_e32 v1, s1
2465; VI-NEXT:    v_mov_b32_e32 v3, s3
2466; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
2467; VI-NEXT:    s_endpgm
2468;
2469; GFX9-LABEL: v5f64_arg:
2470; GFX9:       ; %bb.0: ; %entry
2471; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x40
2472; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2473; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x60
2474; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2475; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2476; GFX9-NEXT:    v_mov_b32_e32 v0, s12
2477; GFX9-NEXT:    v_mov_b32_e32 v1, s13
2478; GFX9-NEXT:    v_mov_b32_e32 v2, s14
2479; GFX9-NEXT:    v_mov_b32_e32 v3, s15
2480; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
2481; GFX9-NEXT:    s_nop 0
2482; GFX9-NEXT:    v_mov_b32_e32 v0, s8
2483; GFX9-NEXT:    v_mov_b32_e32 v1, s9
2484; GFX9-NEXT:    v_mov_b32_e32 v2, s10
2485; GFX9-NEXT:    v_mov_b32_e32 v3, s11
2486; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2487; GFX9-NEXT:    s_nop 0
2488; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2489; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2490; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[6:7] offset:32
2491; GFX9-NEXT:    s_endpgm
2492;
2493; EG-LABEL: v5f64_arg:
2494; EG:       ; %bb.0: ; %entry
2495; EG-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2496; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 0
2497; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
2498; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
2499; EG-NEXT:    CF_END
2500; EG-NEXT:    PAD
2501; EG-NEXT:    ALU clause starting at 6:
2502; EG-NEXT:     MOV * T0.W, KC0[7].X,
2503; EG-NEXT:     MOV * T0.Z, KC0[6].W,
2504; EG-NEXT:     MOV T0.Y, KC0[6].Z,
2505; EG-NEXT:     MOV * T1.W, KC0[8].X,
2506; EG-NEXT:     MOV T0.X, KC0[6].Y,
2507; EG-NEXT:     MOV * T1.Z, KC0[7].W,
2508; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
2509; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
2510; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2511; EG-NEXT:     MOV T1.X, KC0[7].Y,
2512; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2513; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2514; EG-NEXT:     LSHR T3.X, PV.W, literal.x,
2515; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
2516; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
2517; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
2518; EG-NEXT:     MOV T5.Y, KC0[8].Z,
2519; EG-NEXT:     MOV * T5.X, KC0[8].Y,
2520; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2521;
2522; CM-LABEL: v5f64_arg:
2523; CM:       ; %bb.0: ; %entry
2524; CM-NEXT:    ALU 18, @6, KC0[CB0:0-32], KC1[]
2525; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
2526; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T4.X
2527; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
2528; CM-NEXT:    CF_END
2529; CM-NEXT:    PAD
2530; CM-NEXT:    ALU clause starting at 6:
2531; CM-NEXT:     MOV * T0.W, KC0[8].X,
2532; CM-NEXT:     MOV T1.Y, KC0[8].Z,
2533; CM-NEXT:     MOV * T0.Z, KC0[7].W,
2534; CM-NEXT:     MOV T1.X, KC0[8].Y,
2535; CM-NEXT:     MOV * T0.Y, KC0[7].Z,
2536; CM-NEXT:     MOV T0.X, KC0[7].Y,
2537; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
2538; CM-NEXT:     MOV * T2.W, KC0[7].X,
2539; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
2540; CM-NEXT:     LSHR T3.X, PV.Z, literal.x,
2541; CM-NEXT:     MOV T2.Z, KC0[6].W,
2542; CM-NEXT:     ADD_INT * T1.W, KC0[2].Y, literal.y,
2543; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
2544; CM-NEXT:     LSHR T4.X, PV.W, literal.x,
2545; CM-NEXT:     MOV * T2.Y, KC0[6].Z,
2546; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2547; CM-NEXT:     MOV * T2.X, KC0[6].Y,
2548; CM-NEXT:     LSHR * T5.X, KC0[2].Y, literal.x,
2549; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2550entry:
2551  store <5 x double> %in, <5 x double> addrspace(1)* %out, align 8
2552  ret void
2553}
2554
2555; FIXME: Lots of unpack and re-pack junk on VI
; NOTE(review): in the checks below, the VI run loads the argument with a single
; s_load_dwordx2 and issues one flat_store_dwordx2; the byte-wise
; fetch/mask/shift/or sequences only appear in the EG and CM checks. Confirm
; whether the FIXME above still describes current output.
;
; Kernel-argument lowering test for an <8 x i8> argument: the 8-byte vector is
; stored to the global (addrspace 1) pointer %out with no explicit alignment on
; the store. The GCN check blocks expect one dwordx2 store; the EG and CM check
; blocks expect eight VTX_READ_8 fetches merged into two dwords before a single
; store.
2556define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
2557; SI-LABEL: v8i8_arg:
2558; SI:       ; %bb.0: ; %entry
2559; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
2560; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2561; SI-NEXT:    s_mov_b32 s3, 0xf000
2562; SI-NEXT:    s_mov_b32 s2, -1
2563; SI-NEXT:    s_waitcnt lgkmcnt(0)
2564; SI-NEXT:    v_mov_b32_e32 v0, s4
2565; SI-NEXT:    v_mov_b32_e32 v1, s5
2566; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2567; SI-NEXT:    s_endpgm
2568;
2569; VI-LABEL: v8i8_arg:
2570; VI:       ; %bb.0: ; %entry
2571; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
2572; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x2c
2573; VI-NEXT:    s_waitcnt lgkmcnt(0)
2574; VI-NEXT:    v_mov_b32_e32 v0, s2
2575; VI-NEXT:    v_mov_b32_e32 v3, s1
2576; VI-NEXT:    v_mov_b32_e32 v1, s3
2577; VI-NEXT:    v_mov_b32_e32 v2, s0
2578; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
2579; VI-NEXT:    s_endpgm
2580;
2581; GFX9-LABEL: v8i8_arg:
2582; GFX9:       ; %bb.0: ; %entry
2583; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
2584; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
2585; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2586; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2587; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2588; GFX9-NEXT:    v_mov_b32_e32 v1, s3
2589; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
2590; GFX9-NEXT:    s_endpgm
2591;
2592; EG-LABEL: v8i8_arg:
2593; EG:       ; %bb.0: ; %entry
2594; EG-NEXT:    ALU 1, @36, KC0[], KC1[]
2595; EG-NEXT:    TEX 0 @20
2596; EG-NEXT:    ALU 5, @38, KC0[], KC1[]
2597; EG-NEXT:    TEX 0 @22
2598; EG-NEXT:    ALU 5, @44, KC0[], KC1[]
2599; EG-NEXT:    TEX 0 @24
2600; EG-NEXT:    ALU 7, @50, KC0[], KC1[]
2601; EG-NEXT:    TEX 0 @26
2602; EG-NEXT:    ALU 7, @58, KC0[], KC1[]
2603; EG-NEXT:    TEX 0 @28
2604; EG-NEXT:    ALU 7, @66, KC0[], KC1[]
2605; EG-NEXT:    TEX 0 @30
2606; EG-NEXT:    ALU 7, @74, KC0[], KC1[]
2607; EG-NEXT:    TEX 0 @32
2608; EG-NEXT:    ALU 5, @82, KC0[], KC1[]
2609; EG-NEXT:    TEX 0 @34
2610; EG-NEXT:    ALU 5, @88, KC0[CB0:0-32], KC1[]
2611; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
2612; EG-NEXT:    CF_END
2613; EG-NEXT:    PAD
2614; EG-NEXT:    Fetch clause starting at 20:
2615; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 51, #3
2616; EG-NEXT:    Fetch clause starting at 22:
2617; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 47, #3
2618; EG-NEXT:    Fetch clause starting at 24:
2619; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 50, #3
2620; EG-NEXT:    Fetch clause starting at 26:
2621; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 46, #3
2622; EG-NEXT:    Fetch clause starting at 28:
2623; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 49, #3
2624; EG-NEXT:    Fetch clause starting at 30:
2625; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 45, #3
2626; EG-NEXT:    Fetch clause starting at 32:
2627; EG-NEXT:     VTX_READ_8 T6.X, T5.X, 48, #3
2628; EG-NEXT:    Fetch clause starting at 34:
2629; EG-NEXT:     VTX_READ_8 T5.X, T5.X, 44, #3
2630; EG-NEXT:    ALU clause starting at 36:
2631; EG-NEXT:     MOV * T0.Y, T2.X,
2632; EG-NEXT:     MOV * T5.X, 0.0,
2633; EG-NEXT:    ALU clause starting at 38:
2634; EG-NEXT:     LSHL T0.W, T6.X, literal.x,
2635; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2636; EG-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
2637; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
2638; EG-NEXT:     MOV T2.X, PV.W,
2639; EG-NEXT:     MOV * T0.Y, T3.X,
2640; EG-NEXT:    ALU clause starting at 44:
2641; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2642; EG-NEXT:     LSHL * T1.W, T6.X, literal.y,
2643; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
2644; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2645; EG-NEXT:     MOV T3.X, PV.W,
2646; EG-NEXT:     MOV * T0.Y, T2.X,
2647; EG-NEXT:    ALU clause starting at 50:
2648; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2649; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2650; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
2651; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2652; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2653; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2654; EG-NEXT:     MOV T2.X, PV.W,
2655; EG-NEXT:     MOV * T0.Y, T3.X,
2656; EG-NEXT:    ALU clause starting at 58:
2657; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2658; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2659; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
2660; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2661; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
2662; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2663; EG-NEXT:     MOV T3.X, PV.W,
2664; EG-NEXT:     MOV * T0.Y, T2.X,
2665; EG-NEXT:    ALU clause starting at 66:
2666; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2667; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2668; EG-NEXT:    255(3.573311e-43), -65281(nan)
2669; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2670; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2671; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2672; EG-NEXT:     MOV T2.X, PV.W,
2673; EG-NEXT:     MOV * T0.Y, T3.X,
2674; EG-NEXT:    ALU clause starting at 74:
2675; EG-NEXT:     AND_INT T0.W, T6.X, literal.x,
2676; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2677; EG-NEXT:    255(3.573311e-43), -65281(nan)
2678; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
2679; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
2680; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
2681; EG-NEXT:     MOV T3.X, PV.W,
2682; EG-NEXT:     MOV * T0.Y, T2.X,
2683; EG-NEXT:    ALU clause starting at 82:
2684; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2685; EG-NEXT:     AND_INT * T1.W, T6.X, literal.y,
2686; EG-NEXT:    -256(nan), 255(3.573311e-43)
2687; EG-NEXT:     OR_INT * T5.Y, PV.W, PS,
2688; EG-NEXT:     MOV T2.X, PV.Y,
2689; EG-NEXT:     MOV * T0.Y, T3.X,
2690; EG-NEXT:    ALU clause starting at 88:
2691; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2692; EG-NEXT:     AND_INT * T1.W, T5.X, literal.y,
2693; EG-NEXT:    -256(nan), 255(3.573311e-43)
2694; EG-NEXT:     OR_INT T5.X, PV.W, PS,
2695; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
2696; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2697;
2698; CM-LABEL: v8i8_arg:
2699; CM:       ; %bb.0: ; %entry
2700; CM-NEXT:    ALU 1, @36, KC0[], KC1[]
2701; CM-NEXT:    TEX 0 @20
2702; CM-NEXT:    ALU 5, @38, KC0[], KC1[]
2703; CM-NEXT:    TEX 0 @22
2704; CM-NEXT:    ALU 5, @44, KC0[], KC1[]
2705; CM-NEXT:    TEX 0 @24
2706; CM-NEXT:    ALU 7, @50, KC0[], KC1[]
2707; CM-NEXT:    TEX 0 @26
2708; CM-NEXT:    ALU 7, @58, KC0[], KC1[]
2709; CM-NEXT:    TEX 0 @28
2710; CM-NEXT:    ALU 7, @66, KC0[], KC1[]
2711; CM-NEXT:    TEX 0 @30
2712; CM-NEXT:    ALU 7, @74, KC0[], KC1[]
2713; CM-NEXT:    TEX 0 @32
2714; CM-NEXT:    ALU 5, @82, KC0[], KC1[]
2715; CM-NEXT:    TEX 0 @34
2716; CM-NEXT:    ALU 5, @88, KC0[CB0:0-32], KC1[]
2717; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
2718; CM-NEXT:    CF_END
2719; CM-NEXT:    PAD
2720; CM-NEXT:    Fetch clause starting at 20:
2721; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 51, #3
2722; CM-NEXT:    Fetch clause starting at 22:
2723; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 47, #3
2724; CM-NEXT:    Fetch clause starting at 24:
2725; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 50, #3
2726; CM-NEXT:    Fetch clause starting at 26:
2727; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 46, #3
2728; CM-NEXT:    Fetch clause starting at 28:
2729; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 49, #3
2730; CM-NEXT:    Fetch clause starting at 30:
2731; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 45, #3
2732; CM-NEXT:    Fetch clause starting at 32:
2733; CM-NEXT:     VTX_READ_8 T6.X, T5.X, 48, #3
2734; CM-NEXT:    Fetch clause starting at 34:
2735; CM-NEXT:     VTX_READ_8 T5.X, T5.X, 44, #3
2736; CM-NEXT:    ALU clause starting at 36:
2737; CM-NEXT:     MOV * T0.Y, T2.X,
2738; CM-NEXT:     MOV * T5.X, 0.0,
2739; CM-NEXT:    ALU clause starting at 38:
2740; CM-NEXT:     LSHL T0.Z, T6.X, literal.x,
2741; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
2742; CM-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
2743; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
2744; CM-NEXT:     MOV T2.X, PV.W,
2745; CM-NEXT:     MOV * T0.Y, T3.X,
2746; CM-NEXT:    ALU clause starting at 44:
2747; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2748; CM-NEXT:     LSHL * T0.W, T6.X, literal.y,
2749; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
2750; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2751; CM-NEXT:     MOV T3.X, PV.W,
2752; CM-NEXT:     MOV * T0.Y, T2.X,
2753; CM-NEXT:    ALU clause starting at 50:
2754; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2755; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2756; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2757; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2758; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
2759; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2760; CM-NEXT:     MOV T2.X, PV.W,
2761; CM-NEXT:     MOV * T0.Y, T3.X,
2762; CM-NEXT:    ALU clause starting at 58:
2763; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2764; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2765; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2766; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2767; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
2768; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2769; CM-NEXT:     MOV T3.X, PV.W,
2770; CM-NEXT:     MOV * T0.Y, T2.X,
2771; CM-NEXT:    ALU clause starting at 66:
2772; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2773; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2774; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2775; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2776; CM-NEXT:    -65281(nan), 8(1.121039e-44)
2777; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2778; CM-NEXT:     MOV T2.X, PV.W,
2779; CM-NEXT:     MOV * T0.Y, T3.X,
2780; CM-NEXT:    ALU clause starting at 74:
2781; CM-NEXT:     AND_INT * T0.W, T6.X, literal.x,
2782; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2783; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2784; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
2785; CM-NEXT:    -65281(nan), 8(1.121039e-44)
2786; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
2787; CM-NEXT:     MOV T3.X, PV.W,
2788; CM-NEXT:     MOV * T0.Y, T2.X,
2789; CM-NEXT:    ALU clause starting at 82:
2790; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2791; CM-NEXT:     AND_INT * T0.W, T6.X, literal.y,
2792; CM-NEXT:    -256(nan), 255(3.573311e-43)
2793; CM-NEXT:     OR_INT * T5.Y, PV.Z, PV.W,
2794; CM-NEXT:     MOV T2.X, PV.Y,
2795; CM-NEXT:     MOV * T0.Y, T3.X,
2796; CM-NEXT:    ALU clause starting at 88:
2797; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
2798; CM-NEXT:     AND_INT * T0.W, T5.X, literal.y,
2799; CM-NEXT:    -256(nan), 255(3.573311e-43)
2800; CM-NEXT:     OR_INT * T5.X, PV.Z, PV.W,
2801; CM-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
2802; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2803entry:
2804  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
2805  ret void
2806}
2807
; Kernel-argument lowering test for an <8 x i16> argument: the 16-byte vector
; is stored to the global (addrspace 1) pointer %out with no explicit alignment
; on the store. The GCN check blocks expect one dwordx4 scalar load of the
; argument and one dwordx4 store; the EG and CM check blocks expect eight
; VTX_READ_16 fetches that are merged with AND/LSHL/OR into four dwords before a
; single store.
2808define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
2809; SI-LABEL: v8i16_arg:
2810; SI:       ; %bb.0: ; %entry
2811; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
2812; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
2813; SI-NEXT:    s_mov_b32 s3, 0xf000
2814; SI-NEXT:    s_mov_b32 s2, -1
2815; SI-NEXT:    s_waitcnt lgkmcnt(0)
2816; SI-NEXT:    v_mov_b32_e32 v0, s4
2817; SI-NEXT:    v_mov_b32_e32 v1, s5
2818; SI-NEXT:    v_mov_b32_e32 v2, s6
2819; SI-NEXT:    v_mov_b32_e32 v3, s7
2820; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2821; SI-NEXT:    s_endpgm
2822;
2823; VI-LABEL: v8i16_arg:
2824; VI:       ; %bb.0: ; %entry
2825; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
2826; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
2827; VI-NEXT:    s_waitcnt lgkmcnt(0)
2828; VI-NEXT:    v_mov_b32_e32 v4, s4
2829; VI-NEXT:    v_mov_b32_e32 v0, s0
2830; VI-NEXT:    v_mov_b32_e32 v5, s5
2831; VI-NEXT:    v_mov_b32_e32 v1, s1
2832; VI-NEXT:    v_mov_b32_e32 v2, s2
2833; VI-NEXT:    v_mov_b32_e32 v3, s3
2834; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
2835; VI-NEXT:    s_endpgm
2836;
2837; GFX9-LABEL: v8i16_arg:
2838; GFX9:       ; %bb.0: ; %entry
2839; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
2840; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
2841; GFX9-NEXT:    v_mov_b32_e32 v4, 0
2842; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2843; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2844; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2845; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2846; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2847; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
2848; GFX9-NEXT:    s_endpgm
2849;
2850; EG-LABEL: v8i16_arg:
2851; EG:       ; %bb.0: ; %entry
2852; EG-NEXT:    ALU 1, @36, KC0[], KC1[]
2853; EG-NEXT:    TEX 0 @20
2854; EG-NEXT:    ALU 5, @38, KC0[], KC1[]
2855; EG-NEXT:    TEX 0 @22
2856; EG-NEXT:    ALU 5, @44, KC0[], KC1[]
2857; EG-NEXT:    TEX 0 @24
2858; EG-NEXT:    ALU 5, @50, KC0[], KC1[]
2859; EG-NEXT:    TEX 0 @26
2860; EG-NEXT:    ALU 5, @56, KC0[], KC1[]
2861; EG-NEXT:    TEX 0 @28
2862; EG-NEXT:    ALU 5, @62, KC0[], KC1[]
2863; EG-NEXT:    TEX 0 @30
2864; EG-NEXT:    ALU 5, @68, KC0[], KC1[]
2865; EG-NEXT:    TEX 0 @32
2866; EG-NEXT:    ALU 5, @74, KC0[], KC1[]
2867; EG-NEXT:    TEX 0 @34
2868; EG-NEXT:    ALU 8, @80, KC0[CB0:0-32], KC1[]
2869; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
2870; EG-NEXT:    CF_END
2871; EG-NEXT:    PAD
2872; EG-NEXT:    Fetch clause starting at 20:
2873; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 66, #3
2874; EG-NEXT:    Fetch clause starting at 22:
2875; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 58, #3
2876; EG-NEXT:    Fetch clause starting at 24:
2877; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 64, #3
2878; EG-NEXT:    Fetch clause starting at 26:
2879; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 56, #3
2880; EG-NEXT:    Fetch clause starting at 28:
2881; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 62, #3
2882; EG-NEXT:    Fetch clause starting at 30:
2883; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 54, #3
2884; EG-NEXT:    Fetch clause starting at 32:
2885; EG-NEXT:     VTX_READ_16 T8.X, T7.X, 60, #3
2886; EG-NEXT:    Fetch clause starting at 34:
2887; EG-NEXT:     VTX_READ_16 T7.X, T7.X, 52, #3
2888; EG-NEXT:    ALU clause starting at 36:
2889; EG-NEXT:     MOV * T0.Y, T3.X,
2890; EG-NEXT:     MOV * T7.X, 0.0,
2891; EG-NEXT:    ALU clause starting at 38:
2892; EG-NEXT:     LSHL T0.W, T8.X, literal.x,
2893; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
2894; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
2895; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
2896; EG-NEXT:     MOV T3.X, PV.W,
2897; EG-NEXT:     MOV * T0.Y, T5.X,
2898; EG-NEXT:    ALU clause starting at 44:
2899; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2900; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
2901; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2902; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2903; EG-NEXT:     MOV T5.X, PV.W,
2904; EG-NEXT:     MOV * T0.Y, T3.X,
2905; EG-NEXT:    ALU clause starting at 50:
2906; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2907; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
2908; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
2909; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2910; EG-NEXT:     MOV T3.X, PV.W,
2911; EG-NEXT:     MOV * T0.Y, T5.X,
2912; EG-NEXT:    ALU clause starting at 56:
2913; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2914; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
2915; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
2916; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2917; EG-NEXT:     MOV T5.X, PV.W,
2918; EG-NEXT:     MOV * T0.Y, T2.X,
2919; EG-NEXT:    ALU clause starting at 62:
2920; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2921; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
2922; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2923; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2924; EG-NEXT:     MOV T2.X, PV.W,
2925; EG-NEXT:     MOV * T0.Y, T4.X,
2926; EG-NEXT:    ALU clause starting at 68:
2927; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2928; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
2929; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
2930; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
2931; EG-NEXT:     MOV T4.X, PV.W,
2932; EG-NEXT:     MOV * T0.Y, T2.X,
2933; EG-NEXT:    ALU clause starting at 74:
2934; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
2935; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
2936; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
2937; EG-NEXT:     OR_INT * T7.Z, PV.W, PS,
2938; EG-NEXT:     MOV T2.X, PV.Z,
2939; EG-NEXT:     MOV * T0.Y, T4.X,
2940; EG-NEXT:    ALU clause starting at 80:
2941; EG-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
2942; EG-NEXT:     AND_INT T0.W, T0.Y, literal.y,
2943; EG-NEXT:     AND_INT * T1.W, T7.X, literal.z,
2944; EG-NEXT:    2(2.802597e-45), -65536(nan)
2945; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2946; EG-NEXT:     OR_INT * T7.X, PV.W, PS,
2947; EG-NEXT:     MOV T4.X, PV.X,
2948; EG-NEXT:     MOV * T7.W, T3.X,
2949; EG-NEXT:     MOV * T7.Y, T5.X,
2950;
2951; CM-LABEL: v8i16_arg:
2952; CM:       ; %bb.0: ; %entry
2953; CM-NEXT:    ALU 1, @36, KC0[], KC1[]
2954; CM-NEXT:    TEX 0 @20
2955; CM-NEXT:    ALU 5, @38, KC0[], KC1[]
2956; CM-NEXT:    TEX 0 @22
2957; CM-NEXT:    ALU 5, @44, KC0[], KC1[]
2958; CM-NEXT:    TEX 0 @24
2959; CM-NEXT:    ALU 5, @50, KC0[], KC1[]
2960; CM-NEXT:    TEX 0 @26
2961; CM-NEXT:    ALU 5, @56, KC0[], KC1[]
2962; CM-NEXT:    TEX 0 @28
2963; CM-NEXT:    ALU 5, @62, KC0[], KC1[]
2964; CM-NEXT:    TEX 0 @30
2965; CM-NEXT:    ALU 5, @68, KC0[], KC1[]
2966; CM-NEXT:    TEX 0 @32
2967; CM-NEXT:    ALU 5, @74, KC0[], KC1[]
2968; CM-NEXT:    TEX 0 @34
2969; CM-NEXT:    ALU 8, @80, KC0[CB0:0-32], KC1[]
2970; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
2971; CM-NEXT:    CF_END
2972; CM-NEXT:    PAD
2973; CM-NEXT:    Fetch clause starting at 20:
2974; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 66, #3
2975; CM-NEXT:    Fetch clause starting at 22:
2976; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 58, #3
2977; CM-NEXT:    Fetch clause starting at 24:
2978; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 64, #3
2979; CM-NEXT:    Fetch clause starting at 26:
2980; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 56, #3
2981; CM-NEXT:    Fetch clause starting at 28:
2982; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 62, #3
2983; CM-NEXT:    Fetch clause starting at 30:
2984; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 54, #3
2985; CM-NEXT:    Fetch clause starting at 32:
2986; CM-NEXT:     VTX_READ_16 T8.X, T7.X, 60, #3
2987; CM-NEXT:    Fetch clause starting at 34:
2988; CM-NEXT:     VTX_READ_16 T7.X, T7.X, 52, #3
2989; CM-NEXT:    ALU clause starting at 36:
2990; CM-NEXT:     MOV * T0.Y, T3.X,
2991; CM-NEXT:     MOV * T7.X, 0.0,
2992; CM-NEXT:    ALU clause starting at 38:
2993; CM-NEXT:     LSHL T0.Z, T8.X, literal.x,
2994; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
2995; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
2996; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
2997; CM-NEXT:     MOV T3.X, PV.W,
2998; CM-NEXT:     MOV * T0.Y, T5.X,
2999; CM-NEXT:    ALU clause starting at 44:
3000; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3001; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3002; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3003; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3004; CM-NEXT:     MOV T5.X, PV.W,
3005; CM-NEXT:     MOV * T0.Y, T3.X,
3006; CM-NEXT:    ALU clause starting at 50:
3007; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3008; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3009; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3010; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3011; CM-NEXT:     MOV T3.X, PV.W,
3012; CM-NEXT:     MOV * T0.Y, T5.X,
3013; CM-NEXT:    ALU clause starting at 56:
3014; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3015; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3016; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3017; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3018; CM-NEXT:     MOV T5.X, PV.W,
3019; CM-NEXT:     MOV * T0.Y, T2.X,
3020; CM-NEXT:    ALU clause starting at 62:
3021; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3022; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3023; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3024; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3025; CM-NEXT:     MOV T2.X, PV.W,
3026; CM-NEXT:     MOV * T0.Y, T4.X,
3027; CM-NEXT:    ALU clause starting at 68:
3028; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3029; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3030; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3031; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3032; CM-NEXT:     MOV T4.X, PV.W,
3033; CM-NEXT:     MOV * T0.Y, T2.X,
3034; CM-NEXT:    ALU clause starting at 74:
3035; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3036; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3037; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
3038; CM-NEXT:     OR_INT * T7.Z, PV.Z, PV.W,
3039; CM-NEXT:     MOV T2.X, PV.Z,
3040; CM-NEXT:     MOV * T0.Y, T4.X,
3041; CM-NEXT:    ALU clause starting at 80:
3042; CM-NEXT:     LSHR T8.X, KC0[2].Y, literal.x,
3043; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
3044; CM-NEXT:     AND_INT * T0.W, T7.X, literal.z,
3045; CM-NEXT:    2(2.802597e-45), -65536(nan)
3046; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3047; CM-NEXT:     OR_INT * T7.X, PV.Z, PV.W,
3048; CM-NEXT:     MOV T4.X, PV.X,
3049; CM-NEXT:     MOV * T7.W, T3.X,
3050; CM-NEXT:     MOV * T7.Y, T5.X,
3051entry:
3052  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
3053  ret void
3054}
3055
; Kernel-argument lowering test for an <8 x i32> argument: the 32-byte vector
; is stored to the global (addrspace 1) pointer %out with 4-byte alignment.
; Every check block below expects the value to be written as two 16-byte
; stores, one at %out and one at %out + 16; the EG and CM blocks read the
; argument straight from the KC0 constant bank with MOVs (no fetch clauses).
3056define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
3057; SI-LABEL: v8i32_arg:
3058; SI:       ; %bb.0: ; %entry
3059; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
3060; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3061; SI-NEXT:    s_mov_b32 s3, 0xf000
3062; SI-NEXT:    s_mov_b32 s2, -1
3063; SI-NEXT:    s_waitcnt lgkmcnt(0)
3064; SI-NEXT:    v_mov_b32_e32 v0, s8
3065; SI-NEXT:    v_mov_b32_e32 v1, s9
3066; SI-NEXT:    v_mov_b32_e32 v2, s10
3067; SI-NEXT:    v_mov_b32_e32 v3, s11
3068; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3069; SI-NEXT:    s_waitcnt expcnt(0)
3070; SI-NEXT:    v_mov_b32_e32 v0, s4
3071; SI-NEXT:    v_mov_b32_e32 v1, s5
3072; SI-NEXT:    v_mov_b32_e32 v2, s6
3073; SI-NEXT:    v_mov_b32_e32 v3, s7
3074; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3075; SI-NEXT:    s_endpgm
3076;
3077; VI-LABEL: v8i32_arg:
3078; VI:       ; %bb.0: ; %entry
3079; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
3080; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x44
3081; VI-NEXT:    s_waitcnt lgkmcnt(0)
3082; VI-NEXT:    v_mov_b32_e32 v0, s4
3083; VI-NEXT:    s_add_u32 s4, s8, 16
3084; VI-NEXT:    v_mov_b32_e32 v1, s5
3085; VI-NEXT:    s_addc_u32 s5, s9, 0
3086; VI-NEXT:    v_mov_b32_e32 v4, s4
3087; VI-NEXT:    v_mov_b32_e32 v2, s6
3088; VI-NEXT:    v_mov_b32_e32 v3, s7
3089; VI-NEXT:    v_mov_b32_e32 v5, s5
3090; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3091; VI-NEXT:    v_mov_b32_e32 v4, s8
3092; VI-NEXT:    v_mov_b32_e32 v0, s0
3093; VI-NEXT:    v_mov_b32_e32 v1, s1
3094; VI-NEXT:    v_mov_b32_e32 v2, s2
3095; VI-NEXT:    v_mov_b32_e32 v3, s3
3096; VI-NEXT:    v_mov_b32_e32 v5, s9
3097; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3098; VI-NEXT:    s_endpgm
3099;
3100; GFX9-LABEL: v8i32_arg:
3101; GFX9:       ; %bb.0: ; %entry
3102; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3103; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
3104; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3105; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3106; GFX9-NEXT:    v_mov_b32_e32 v0, s12
3107; GFX9-NEXT:    v_mov_b32_e32 v1, s13
3108; GFX9-NEXT:    v_mov_b32_e32 v2, s14
3109; GFX9-NEXT:    v_mov_b32_e32 v3, s15
3110; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
3111; GFX9-NEXT:    s_nop 0
3112; GFX9-NEXT:    v_mov_b32_e32 v0, s8
3113; GFX9-NEXT:    v_mov_b32_e32 v1, s9
3114; GFX9-NEXT:    v_mov_b32_e32 v2, s10
3115; GFX9-NEXT:    v_mov_b32_e32 v3, s11
3116; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
3117; GFX9-NEXT:    s_endpgm
3118;
3119; EG-LABEL: v8i32_arg:
3120; EG:       ; %bb.0: ; %entry
3121; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3122; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
3123; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
3124; EG-NEXT:    CF_END
3125; EG-NEXT:    ALU clause starting at 4:
3126; EG-NEXT:     MOV * T0.W, KC0[5].X,
3127; EG-NEXT:     MOV * T0.Z, KC0[4].W,
3128; EG-NEXT:     MOV T0.Y, KC0[4].Z,
3129; EG-NEXT:     MOV * T1.W, KC0[6].X,
3130; EG-NEXT:     MOV T0.X, KC0[4].Y,
3131; EG-NEXT:     MOV * T1.Z, KC0[5].W,
3132; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
3133; EG-NEXT:     MOV * T1.Y, KC0[5].Z,
3134; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3135; EG-NEXT:     MOV T1.X, KC0[5].Y,
3136; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3137; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3138; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
3139; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3140;
3141; CM-LABEL: v8i32_arg:
3142; CM:       ; %bb.0: ; %entry
3143; CM-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3144; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
3145; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
3146; CM-NEXT:    CF_END
3147; CM-NEXT:    ALU clause starting at 4:
3148; CM-NEXT:     MOV * T0.W, KC0[6].X,
3149; CM-NEXT:     MOV * T0.Z, KC0[5].W,
3150; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
3151; CM-NEXT:     MOV T0.X, KC0[5].Y,
3152; CM-NEXT:     MOV * T1.W, KC0[5].X,
3153; CM-NEXT:     MOV T1.Z, KC0[4].W,
3154; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3155; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3156; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
3157; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
3158; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3159; CM-NEXT:     MOV * T1.X, KC0[4].Y,
3160; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
3161; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3162entry:
3163  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
3164  ret void
3165}
3166
; v8f32_arg: kernel that stores its by-value <8 x float> argument %in to the
; addrspace(1) pointer %out. The check lines below are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; In the expected SI, VI and GFX9 output the vector argument is fetched with one
; s_load_dwordx8 and written back as two dwordx4 stores (at %out+16 and %out).
; In the expected EG and CM output the elements are moved out of KC0 into T0 and
; T1, which are then written with two MEM_RAT_CACHELESS stores.
3167define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
3168; SI-LABEL: v8f32_arg:
3169; SI:       ; %bb.0: ; %entry
3170; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
3171; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3172; SI-NEXT:    s_mov_b32 s3, 0xf000
3173; SI-NEXT:    s_mov_b32 s2, -1
3174; SI-NEXT:    s_waitcnt lgkmcnt(0)
3175; SI-NEXT:    v_mov_b32_e32 v0, s8
3176; SI-NEXT:    v_mov_b32_e32 v1, s9
3177; SI-NEXT:    v_mov_b32_e32 v2, s10
3178; SI-NEXT:    v_mov_b32_e32 v3, s11
3179; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3180; SI-NEXT:    s_waitcnt expcnt(0)
3181; SI-NEXT:    v_mov_b32_e32 v0, s4
3182; SI-NEXT:    v_mov_b32_e32 v1, s5
3183; SI-NEXT:    v_mov_b32_e32 v2, s6
3184; SI-NEXT:    v_mov_b32_e32 v3, s7
3185; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3186; SI-NEXT:    s_endpgm
3187;
3188; VI-LABEL: v8f32_arg:
3189; VI:       ; %bb.0: ; %entry
3190; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
3191; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x44
3192; VI-NEXT:    s_waitcnt lgkmcnt(0)
3193; VI-NEXT:    v_mov_b32_e32 v0, s4
3194; VI-NEXT:    s_add_u32 s4, s8, 16
3195; VI-NEXT:    v_mov_b32_e32 v1, s5
3196; VI-NEXT:    s_addc_u32 s5, s9, 0
3197; VI-NEXT:    v_mov_b32_e32 v4, s4
3198; VI-NEXT:    v_mov_b32_e32 v2, s6
3199; VI-NEXT:    v_mov_b32_e32 v3, s7
3200; VI-NEXT:    v_mov_b32_e32 v5, s5
3201; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3202; VI-NEXT:    v_mov_b32_e32 v4, s8
3203; VI-NEXT:    v_mov_b32_e32 v0, s0
3204; VI-NEXT:    v_mov_b32_e32 v1, s1
3205; VI-NEXT:    v_mov_b32_e32 v2, s2
3206; VI-NEXT:    v_mov_b32_e32 v3, s3
3207; VI-NEXT:    v_mov_b32_e32 v5, s9
3208; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3209; VI-NEXT:    s_endpgm
3210;
3211; GFX9-LABEL: v8f32_arg:
3212; GFX9:       ; %bb.0: ; %entry
3213; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3214; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
3215; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3216; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3217; GFX9-NEXT:    v_mov_b32_e32 v0, s12
3218; GFX9-NEXT:    v_mov_b32_e32 v1, s13
3219; GFX9-NEXT:    v_mov_b32_e32 v2, s14
3220; GFX9-NEXT:    v_mov_b32_e32 v3, s15
3221; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
3222; GFX9-NEXT:    s_nop 0
3223; GFX9-NEXT:    v_mov_b32_e32 v0, s8
3224; GFX9-NEXT:    v_mov_b32_e32 v1, s9
3225; GFX9-NEXT:    v_mov_b32_e32 v2, s10
3226; GFX9-NEXT:    v_mov_b32_e32 v3, s11
3227; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
3228; GFX9-NEXT:    s_endpgm
3229;
3230; EG-LABEL: v8f32_arg:
3231; EG:       ; %bb.0: ; %entry
3232; EG-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3233; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
3234; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
3235; EG-NEXT:    CF_END
3236; EG-NEXT:    ALU clause starting at 4:
3237; EG-NEXT:     MOV * T0.W, KC0[5].X,
3238; EG-NEXT:     MOV * T0.Z, KC0[4].W,
3239; EG-NEXT:     MOV T0.Y, KC0[4].Z,
3240; EG-NEXT:     MOV * T1.W, KC0[6].X,
3241; EG-NEXT:     MOV T0.X, KC0[4].Y,
3242; EG-NEXT:     MOV * T1.Z, KC0[5].W,
3243; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
3244; EG-NEXT:     MOV * T1.Y, KC0[5].Z,
3245; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3246; EG-NEXT:     MOV T1.X, KC0[5].Y,
3247; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3248; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3249; EG-NEXT:     LSHR * T3.X, PV.W, literal.x,
3250; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3251;
3252; CM-LABEL: v8f32_arg:
3253; CM:       ; %bb.0: ; %entry
3254; CM-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
3255; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T3.X
3256; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
3257; CM-NEXT:    CF_END
3258; CM-NEXT:    ALU clause starting at 4:
3259; CM-NEXT:     MOV * T0.W, KC0[6].X,
3260; CM-NEXT:     MOV * T0.Z, KC0[5].W,
3261; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
3262; CM-NEXT:     MOV T0.X, KC0[5].Y,
3263; CM-NEXT:     MOV * T1.W, KC0[5].X,
3264; CM-NEXT:     MOV T1.Z, KC0[4].W,
3265; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
3266; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3267; CM-NEXT:     LSHR T2.X, PV.W, literal.x,
3268; CM-NEXT:     MOV * T1.Y, KC0[4].Z,
3269; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3270; CM-NEXT:     MOV * T1.X, KC0[4].Y,
3271; CM-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
3272; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3273entry:
  ; The store is only 4-byte aligned (align 4), not aligned to the full
  ; 32-byte size of the <8 x float> value.
3274  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
3275  ret void
3276}
3277
3278; FIXME: Pack/repack on VI
; v16i8_arg: kernel that stores its by-value <16 x i8> argument %in to the
; addrspace(1) pointer %out (the store carries no explicit alignment).
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them instead of editing by hand.
; Expected SI, VI and GFX9 output: one s_load_dwordx4 for the vector followed by
; a single dwordx4 store.
; Expected EG and CM output: sixteen VTX_READ_8 fetches (byte offsets 52
; through 67) whose results are combined with AND_INT/LSHL/OR_INT into
; register T7, followed by one MEM_RAT_CACHELESS store.
; NOTE(review): the VI check lines below show no per-byte unpack/repack, so the
; FIXME above may be stale - confirm before removing it.
3279define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
3280; SI-LABEL: v16i8_arg:
3281; SI:       ; %bb.0: ; %entry
3282; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
3283; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3284; SI-NEXT:    s_mov_b32 s3, 0xf000
3285; SI-NEXT:    s_mov_b32 s2, -1
3286; SI-NEXT:    s_waitcnt lgkmcnt(0)
3287; SI-NEXT:    v_mov_b32_e32 v0, s4
3288; SI-NEXT:    v_mov_b32_e32 v1, s5
3289; SI-NEXT:    v_mov_b32_e32 v2, s6
3290; SI-NEXT:    v_mov_b32_e32 v3, s7
3291; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3292; SI-NEXT:    s_endpgm
3293;
3294; VI-LABEL: v16i8_arg:
3295; VI:       ; %bb.0: ; %entry
3296; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x24
3297; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x34
3298; VI-NEXT:    s_waitcnt lgkmcnt(0)
3299; VI-NEXT:    v_mov_b32_e32 v4, s4
3300; VI-NEXT:    v_mov_b32_e32 v0, s0
3301; VI-NEXT:    v_mov_b32_e32 v5, s5
3302; VI-NEXT:    v_mov_b32_e32 v1, s1
3303; VI-NEXT:    v_mov_b32_e32 v2, s2
3304; VI-NEXT:    v_mov_b32_e32 v3, s3
3305; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3306; VI-NEXT:    s_endpgm
3307;
3308; GFX9-LABEL: v16i8_arg:
3309; GFX9:       ; %bb.0: ; %entry
3310; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
3311; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
3312; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3314; GFX9-NEXT:    v_mov_b32_e32 v0, s0
3315; GFX9-NEXT:    v_mov_b32_e32 v1, s1
3316; GFX9-NEXT:    v_mov_b32_e32 v2, s2
3317; GFX9-NEXT:    v_mov_b32_e32 v3, s3
3318; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
3319; GFX9-NEXT:    s_endpgm
3320;
3321; EG-LABEL: v16i8_arg:
3322; EG:       ; %bb.0: ; %entry
3323; EG-NEXT:    ALU 1, @68, KC0[], KC1[]
3324; EG-NEXT:    TEX 0 @36
3325; EG-NEXT:    ALU 5, @70, KC0[], KC1[]
3326; EG-NEXT:    TEX 0 @38
3327; EG-NEXT:    ALU 5, @76, KC0[], KC1[]
3328; EG-NEXT:    TEX 0 @40
3329; EG-NEXT:    ALU 5, @82, KC0[], KC1[]
3330; EG-NEXT:    TEX 0 @42
3331; EG-NEXT:    ALU 5, @88, KC0[], KC1[]
3332; EG-NEXT:    TEX 0 @44
3333; EG-NEXT:    ALU 7, @94, KC0[], KC1[]
3334; EG-NEXT:    TEX 0 @46
3335; EG-NEXT:    ALU 7, @102, KC0[], KC1[]
3336; EG-NEXT:    TEX 0 @48
3337; EG-NEXT:    ALU 7, @110, KC0[], KC1[]
3338; EG-NEXT:    TEX 0 @50
3339; EG-NEXT:    ALU 7, @118, KC0[], KC1[]
3340; EG-NEXT:    TEX 0 @52
3341; EG-NEXT:    ALU 7, @126, KC0[], KC1[]
3342; EG-NEXT:    TEX 0 @54
3343; EG-NEXT:    ALU 7, @134, KC0[], KC1[]
3344; EG-NEXT:    TEX 0 @56
3345; EG-NEXT:    ALU 7, @142, KC0[], KC1[]
3346; EG-NEXT:    TEX 0 @58
3347; EG-NEXT:    ALU 7, @150, KC0[], KC1[]
3348; EG-NEXT:    TEX 0 @60
3349; EG-NEXT:    ALU 5, @158, KC0[], KC1[]
3350; EG-NEXT:    TEX 0 @62
3351; EG-NEXT:    ALU 5, @164, KC0[], KC1[]
3352; EG-NEXT:    TEX 0 @64
3353; EG-NEXT:    ALU 5, @170, KC0[], KC1[]
3354; EG-NEXT:    TEX 0 @66
3355; EG-NEXT:    ALU 5, @176, KC0[CB0:0-32], KC1[]
3356; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 1
3357; EG-NEXT:    CF_END
3358; EG-NEXT:    PAD
3359; EG-NEXT:    Fetch clause starting at 36:
3360; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 67, #3
3361; EG-NEXT:    Fetch clause starting at 38:
3362; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 63, #3
3363; EG-NEXT:    Fetch clause starting at 40:
3364; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 59, #3
3365; EG-NEXT:    Fetch clause starting at 42:
3366; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 55, #3
3367; EG-NEXT:    Fetch clause starting at 44:
3368; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 66, #3
3369; EG-NEXT:    Fetch clause starting at 46:
3370; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 62, #3
3371; EG-NEXT:    Fetch clause starting at 48:
3372; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 58, #3
3373; EG-NEXT:    Fetch clause starting at 50:
3374; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 54, #3
3375; EG-NEXT:    Fetch clause starting at 52:
3376; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 65, #3
3377; EG-NEXT:    Fetch clause starting at 54:
3378; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 61, #3
3379; EG-NEXT:    Fetch clause starting at 56:
3380; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 57, #3
3381; EG-NEXT:    Fetch clause starting at 58:
3382; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 53, #3
3383; EG-NEXT:    Fetch clause starting at 60:
3384; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 64, #3
3385; EG-NEXT:    Fetch clause starting at 62:
3386; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 60, #3
3387; EG-NEXT:    Fetch clause starting at 64:
3388; EG-NEXT:     VTX_READ_8 T8.X, T7.X, 56, #3
3389; EG-NEXT:    Fetch clause starting at 66:
3390; EG-NEXT:     VTX_READ_8 T7.X, T7.X, 52, #3
3391; EG-NEXT:    ALU clause starting at 68:
3392; EG-NEXT:     MOV * T0.Y, T2.X,
3393; EG-NEXT:     MOV * T7.X, 0.0,
3394; EG-NEXT:    ALU clause starting at 70:
3395; EG-NEXT:     LSHL T0.W, T8.X, literal.x,
3396; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3397; EG-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
3398; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
3399; EG-NEXT:     MOV T2.X, PV.W,
3400; EG-NEXT:     MOV * T0.Y, T3.X,
3401; EG-NEXT:    ALU clause starting at 76:
3402; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3403; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
3404; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3405; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3406; EG-NEXT:     MOV T3.X, PV.W,
3407; EG-NEXT:     MOV * T0.Y, T4.X,
3408; EG-NEXT:    ALU clause starting at 82:
3409; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3410; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
3411; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3412; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3413; EG-NEXT:     MOV T4.X, PV.W,
3414; EG-NEXT:     MOV * T0.Y, T5.X,
3415; EG-NEXT:    ALU clause starting at 88:
3416; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3417; EG-NEXT:     LSHL * T1.W, T8.X, literal.y,
3418; EG-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3419; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3420; EG-NEXT:     MOV T5.X, PV.W,
3421; EG-NEXT:     MOV * T0.Y, T2.X,
3422; EG-NEXT:    ALU clause starting at 94:
3423; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3424; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3425; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3426; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3427; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3428; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3429; EG-NEXT:     MOV T2.X, PV.W,
3430; EG-NEXT:     MOV * T0.Y, T3.X,
3431; EG-NEXT:    ALU clause starting at 102:
3432; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3433; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3434; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3435; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3436; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3437; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3438; EG-NEXT:     MOV T3.X, PV.W,
3439; EG-NEXT:     MOV * T0.Y, T4.X,
3440; EG-NEXT:    ALU clause starting at 110:
3441; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3442; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3443; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3444; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3445; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3446; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3447; EG-NEXT:     MOV T4.X, PV.W,
3448; EG-NEXT:     MOV * T0.Y, T5.X,
3449; EG-NEXT:    ALU clause starting at 118:
3450; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3451; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3452; EG-NEXT:    255(3.573311e-43), -16711681(-1.714704e+38)
3453; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3454; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
3455; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3456; EG-NEXT:     MOV T5.X, PV.W,
3457; EG-NEXT:     MOV * T0.Y, T2.X,
3458; EG-NEXT:    ALU clause starting at 126:
3459; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3460; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3461; EG-NEXT:    255(3.573311e-43), -65281(nan)
3462; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3463; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3464; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3465; EG-NEXT:     MOV T2.X, PV.W,
3466; EG-NEXT:     MOV * T0.Y, T3.X,
3467; EG-NEXT:    ALU clause starting at 134:
3468; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3469; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3470; EG-NEXT:    255(3.573311e-43), -65281(nan)
3471; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3472; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3473; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3474; EG-NEXT:     MOV T3.X, PV.W,
3475; EG-NEXT:     MOV * T0.Y, T4.X,
3476; EG-NEXT:    ALU clause starting at 142:
3477; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3478; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3479; EG-NEXT:    255(3.573311e-43), -65281(nan)
3480; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3481; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3482; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3483; EG-NEXT:     MOV T4.X, PV.W,
3484; EG-NEXT:     MOV * T0.Y, T5.X,
3485; EG-NEXT:    ALU clause starting at 150:
3486; EG-NEXT:     AND_INT T0.W, T8.X, literal.x,
3487; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3488; EG-NEXT:    255(3.573311e-43), -65281(nan)
3489; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
3490; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
3491; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
3492; EG-NEXT:     MOV T5.X, PV.W,
3493; EG-NEXT:     MOV * T0.Y, T2.X,
3494; EG-NEXT:    ALU clause starting at 158:
3495; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3496; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
3497; EG-NEXT:    -256(nan), 255(3.573311e-43)
3498; EG-NEXT:     OR_INT * T7.W, PV.W, PS,
3499; EG-NEXT:     MOV T2.X, PV.W,
3500; EG-NEXT:     MOV * T0.Y, T3.X,
3501; EG-NEXT:    ALU clause starting at 164:
3502; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3503; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
3504; EG-NEXT:    -256(nan), 255(3.573311e-43)
3505; EG-NEXT:     OR_INT * T7.Z, PV.W, PS,
3506; EG-NEXT:     MOV T3.X, PV.Z,
3507; EG-NEXT:     MOV * T0.Y, T4.X,
3508; EG-NEXT:    ALU clause starting at 170:
3509; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3510; EG-NEXT:     AND_INT * T1.W, T8.X, literal.y,
3511; EG-NEXT:    -256(nan), 255(3.573311e-43)
3512; EG-NEXT:     OR_INT * T7.Y, PV.W, PS,
3513; EG-NEXT:     MOV T4.X, PV.Y,
3514; EG-NEXT:     MOV * T0.Y, T5.X,
3515; EG-NEXT:    ALU clause starting at 176:
3516; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3517; EG-NEXT:     AND_INT * T1.W, T7.X, literal.y,
3518; EG-NEXT:    -256(nan), 255(3.573311e-43)
3519; EG-NEXT:     OR_INT T7.X, PV.W, PS,
3520; EG-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
3521; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3522;
3523; CM-LABEL: v16i8_arg:
3524; CM:       ; %bb.0: ; %entry
3525; CM-NEXT:    ALU 1, @68, KC0[], KC1[]
3526; CM-NEXT:    TEX 0 @36
3527; CM-NEXT:    ALU 5, @70, KC0[], KC1[]
3528; CM-NEXT:    TEX 0 @38
3529; CM-NEXT:    ALU 5, @76, KC0[], KC1[]
3530; CM-NEXT:    TEX 0 @40
3531; CM-NEXT:    ALU 5, @82, KC0[], KC1[]
3532; CM-NEXT:    TEX 0 @42
3533; CM-NEXT:    ALU 5, @88, KC0[], KC1[]
3534; CM-NEXT:    TEX 0 @44
3535; CM-NEXT:    ALU 7, @94, KC0[], KC1[]
3536; CM-NEXT:    TEX 0 @46
3537; CM-NEXT:    ALU 7, @102, KC0[], KC1[]
3538; CM-NEXT:    TEX 0 @48
3539; CM-NEXT:    ALU 7, @110, KC0[], KC1[]
3540; CM-NEXT:    TEX 0 @50
3541; CM-NEXT:    ALU 7, @118, KC0[], KC1[]
3542; CM-NEXT:    TEX 0 @52
3543; CM-NEXT:    ALU 7, @126, KC0[], KC1[]
3544; CM-NEXT:    TEX 0 @54
3545; CM-NEXT:    ALU 7, @134, KC0[], KC1[]
3546; CM-NEXT:    TEX 0 @56
3547; CM-NEXT:    ALU 7, @142, KC0[], KC1[]
3548; CM-NEXT:    TEX 0 @58
3549; CM-NEXT:    ALU 7, @150, KC0[], KC1[]
3550; CM-NEXT:    TEX 0 @60
3551; CM-NEXT:    ALU 5, @158, KC0[], KC1[]
3552; CM-NEXT:    TEX 0 @62
3553; CM-NEXT:    ALU 5, @164, KC0[], KC1[]
3554; CM-NEXT:    TEX 0 @64
3555; CM-NEXT:    ALU 5, @170, KC0[], KC1[]
3556; CM-NEXT:    TEX 0 @66
3557; CM-NEXT:    ALU 5, @176, KC0[CB0:0-32], KC1[]
3558; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
3559; CM-NEXT:    CF_END
3560; CM-NEXT:    PAD
3561; CM-NEXT:    Fetch clause starting at 36:
3562; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 67, #3
3563; CM-NEXT:    Fetch clause starting at 38:
3564; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 63, #3
3565; CM-NEXT:    Fetch clause starting at 40:
3566; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 59, #3
3567; CM-NEXT:    Fetch clause starting at 42:
3568; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 55, #3
3569; CM-NEXT:    Fetch clause starting at 44:
3570; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 66, #3
3571; CM-NEXT:    Fetch clause starting at 46:
3572; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 62, #3
3573; CM-NEXT:    Fetch clause starting at 48:
3574; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 58, #3
3575; CM-NEXT:    Fetch clause starting at 50:
3576; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 54, #3
3577; CM-NEXT:    Fetch clause starting at 52:
3578; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 65, #3
3579; CM-NEXT:    Fetch clause starting at 54:
3580; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 61, #3
3581; CM-NEXT:    Fetch clause starting at 56:
3582; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 57, #3
3583; CM-NEXT:    Fetch clause starting at 58:
3584; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 53, #3
3585; CM-NEXT:    Fetch clause starting at 60:
3586; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 64, #3
3587; CM-NEXT:    Fetch clause starting at 62:
3588; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 60, #3
3589; CM-NEXT:    Fetch clause starting at 64:
3590; CM-NEXT:     VTX_READ_8 T8.X, T7.X, 56, #3
3591; CM-NEXT:    Fetch clause starting at 66:
3592; CM-NEXT:     VTX_READ_8 T7.X, T7.X, 52, #3
3593; CM-NEXT:    ALU clause starting at 68:
3594; CM-NEXT:     MOV * T0.Y, T2.X,
3595; CM-NEXT:     MOV * T7.X, 0.0,
3596; CM-NEXT:    ALU clause starting at 70:
3597; CM-NEXT:     LSHL T0.Z, T8.X, literal.x,
3598; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
3599; CM-NEXT:    24(3.363116e-44), 16777215(2.350989e-38)
3600; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
3601; CM-NEXT:     MOV T2.X, PV.W,
3602; CM-NEXT:     MOV * T0.Y, T3.X,
3603; CM-NEXT:    ALU clause starting at 76:
3604; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3605; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3606; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3607; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3608; CM-NEXT:     MOV T3.X, PV.W,
3609; CM-NEXT:     MOV * T0.Y, T4.X,
3610; CM-NEXT:    ALU clause starting at 82:
3611; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3612; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3613; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3614; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3615; CM-NEXT:     MOV T4.X, PV.W,
3616; CM-NEXT:     MOV * T0.Y, T5.X,
3617; CM-NEXT:    ALU clause starting at 88:
3618; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3619; CM-NEXT:     LSHL * T0.W, T8.X, literal.y,
3620; CM-NEXT:    16777215(2.350989e-38), 24(3.363116e-44)
3621; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3622; CM-NEXT:     MOV T5.X, PV.W,
3623; CM-NEXT:     MOV * T0.Y, T2.X,
3624; CM-NEXT:    ALU clause starting at 94:
3625; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3626; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3627; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3628; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3629; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3630; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3631; CM-NEXT:     MOV T2.X, PV.W,
3632; CM-NEXT:     MOV * T0.Y, T3.X,
3633; CM-NEXT:    ALU clause starting at 102:
3634; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3635; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3636; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3637; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3638; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3639; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3640; CM-NEXT:     MOV T3.X, PV.W,
3641; CM-NEXT:     MOV * T0.Y, T4.X,
3642; CM-NEXT:    ALU clause starting at 110:
3643; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3644; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3645; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3646; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3647; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3648; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3649; CM-NEXT:     MOV T4.X, PV.W,
3650; CM-NEXT:     MOV * T0.Y, T5.X,
3651; CM-NEXT:    ALU clause starting at 118:
3652; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3653; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3654; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3655; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3656; CM-NEXT:    -16711681(-1.714704e+38), 16(2.242078e-44)
3657; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3658; CM-NEXT:     MOV T5.X, PV.W,
3659; CM-NEXT:     MOV * T0.Y, T2.X,
3660; CM-NEXT:    ALU clause starting at 126:
3661; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3662; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3663; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3664; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3665; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3666; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3667; CM-NEXT:     MOV T2.X, PV.W,
3668; CM-NEXT:     MOV * T0.Y, T3.X,
3669; CM-NEXT:    ALU clause starting at 134:
3670; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3671; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3672; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3673; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3674; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3675; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3676; CM-NEXT:     MOV T3.X, PV.W,
3677; CM-NEXT:     MOV * T0.Y, T4.X,
3678; CM-NEXT:    ALU clause starting at 142:
3679; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3680; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3681; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3682; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3683; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3684; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3685; CM-NEXT:     MOV T4.X, PV.W,
3686; CM-NEXT:     MOV * T0.Y, T5.X,
3687; CM-NEXT:    ALU clause starting at 150:
3688; CM-NEXT:     AND_INT * T0.W, T8.X, literal.x,
3689; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
3690; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3691; CM-NEXT:     LSHL * T0.W, PV.W, literal.y,
3692; CM-NEXT:    -65281(nan), 8(1.121039e-44)
3693; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
3694; CM-NEXT:     MOV T5.X, PV.W,
3695; CM-NEXT:     MOV * T0.Y, T2.X,
3696; CM-NEXT:    ALU clause starting at 158:
3697; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3698; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3699; CM-NEXT:    -256(nan), 255(3.573311e-43)
3700; CM-NEXT:     OR_INT * T7.W, PV.Z, PV.W,
3701; CM-NEXT:     MOV T2.X, PV.W,
3702; CM-NEXT:     MOV * T0.Y, T3.X,
3703; CM-NEXT:    ALU clause starting at 164:
3704; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3705; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3706; CM-NEXT:    -256(nan), 255(3.573311e-43)
3707; CM-NEXT:     OR_INT * T7.Z, PV.Z, PV.W,
3708; CM-NEXT:     MOV T3.X, PV.Z,
3709; CM-NEXT:     MOV * T0.Y, T4.X,
3710; CM-NEXT:    ALU clause starting at 170:
3711; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3712; CM-NEXT:     AND_INT * T0.W, T8.X, literal.y,
3713; CM-NEXT:    -256(nan), 255(3.573311e-43)
3714; CM-NEXT:     OR_INT * T7.Y, PV.Z, PV.W,
3715; CM-NEXT:     MOV T4.X, PV.Y,
3716; CM-NEXT:     MOV * T0.Y, T5.X,
3717; CM-NEXT:    ALU clause starting at 176:
3718; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
3719; CM-NEXT:     AND_INT * T0.W, T7.X, literal.y,
3720; CM-NEXT:    -256(nan), 255(3.573311e-43)
3721; CM-NEXT:     OR_INT * T7.X, PV.Z, PV.W,
3722; CM-NEXT:     LSHR * T8.X, KC0[2].Y, literal.x,
3723; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
3724entry:
  ; Whole-vector store of the 16-byte argument; no alignment is specified here.
3725  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
3726  ret void
3727}
3728
3729define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
3730; SI-LABEL: v16i16_arg:
3731; SI:       ; %bb.0: ; %entry
3732; SI-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x11
3733; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
3734; SI-NEXT:    s_mov_b32 s3, 0xf000
3735; SI-NEXT:    s_mov_b32 s2, -1
3736; SI-NEXT:    s_waitcnt lgkmcnt(0)
3737; SI-NEXT:    v_mov_b32_e32 v0, s8
3738; SI-NEXT:    v_mov_b32_e32 v1, s9
3739; SI-NEXT:    v_mov_b32_e32 v2, s10
3740; SI-NEXT:    v_mov_b32_e32 v3, s11
3741; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
3742; SI-NEXT:    s_waitcnt expcnt(0)
3743; SI-NEXT:    v_mov_b32_e32 v0, s4
3744; SI-NEXT:    v_mov_b32_e32 v1, s5
3745; SI-NEXT:    v_mov_b32_e32 v2, s6
3746; SI-NEXT:    v_mov_b32_e32 v3, s7
3747; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
3748; SI-NEXT:    s_endpgm
3749;
3750; VI-LABEL: v16i16_arg:
3751; VI:       ; %bb.0: ; %entry
3752; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
3753; VI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x44
3754; VI-NEXT:    s_waitcnt lgkmcnt(0)
3755; VI-NEXT:    v_mov_b32_e32 v0, s4
3756; VI-NEXT:    s_add_u32 s4, s8, 16
3757; VI-NEXT:    v_mov_b32_e32 v1, s5
3758; VI-NEXT:    s_addc_u32 s5, s9, 0
3759; VI-NEXT:    v_mov_b32_e32 v4, s4
3760; VI-NEXT:    v_mov_b32_e32 v2, s6
3761; VI-NEXT:    v_mov_b32_e32 v3, s7
3762; VI-NEXT:    v_mov_b32_e32 v5, s5
3763; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3764; VI-NEXT:    v_mov_b32_e32 v4, s8
3765; VI-NEXT:    v_mov_b32_e32 v0, s0
3766; VI-NEXT:    v_mov_b32_e32 v1, s1
3767; VI-NEXT:    v_mov_b32_e32 v2, s2
3768; VI-NEXT:    v_mov_b32_e32 v3, s3
3769; VI-NEXT:    v_mov_b32_e32 v5, s9
3770; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
3771; VI-NEXT:    s_endpgm
3772;
3773; GFX9-LABEL: v16i16_arg:
3774; GFX9:       ; %bb.0: ; %entry
3775; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
3776; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
3777; GFX9-NEXT:    v_mov_b32_e32 v4, 0
3778; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3779; GFX9-NEXT:    v_mov_b32_e32 v0, s12
3780; GFX9-NEXT:    v_mov_b32_e32 v1, s13
3781; GFX9-NEXT:    v_mov_b32_e32 v2, s14
3782; GFX9-NEXT:    v_mov_b32_e32 v3, s15
3783; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
3784; GFX9-NEXT:    s_nop 0
3785; GFX9-NEXT:    v_mov_b32_e32 v0, s8
3786; GFX9-NEXT:    v_mov_b32_e32 v1, s9
3787; GFX9-NEXT:    v_mov_b32_e32 v2, s10
3788; GFX9-NEXT:    v_mov_b32_e32 v3, s11
3789; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
3790; GFX9-NEXT:    s_endpgm
3791;
3792; EG-LABEL: v16i16_arg:
3793; EG:       ; %bb.0: ; %entry
3794; EG-NEXT:    ALU 1, @68, KC0[], KC1[]
3795; EG-NEXT:    TEX 0 @36
3796; EG-NEXT:    ALU 5, @70, KC0[], KC1[]
3797; EG-NEXT:    TEX 0 @38
3798; EG-NEXT:    ALU 5, @76, KC0[], KC1[]
3799; EG-NEXT:    TEX 0 @40
3800; EG-NEXT:    ALU 5, @82, KC0[], KC1[]
3801; EG-NEXT:    TEX 0 @42
3802; EG-NEXT:    ALU 5, @88, KC0[], KC1[]
3803; EG-NEXT:    TEX 0 @44
3804; EG-NEXT:    ALU 5, @94, KC0[], KC1[]
3805; EG-NEXT:    TEX 0 @46
3806; EG-NEXT:    ALU 5, @100, KC0[], KC1[]
3807; EG-NEXT:    TEX 0 @48
3808; EG-NEXT:    ALU 5, @106, KC0[], KC1[]
3809; EG-NEXT:    TEX 0 @50
3810; EG-NEXT:    ALU 5, @112, KC0[], KC1[]
3811; EG-NEXT:    TEX 0 @52
3812; EG-NEXT:    ALU 5, @118, KC0[], KC1[]
3813; EG-NEXT:    TEX 0 @54
3814; EG-NEXT:    ALU 5, @124, KC0[], KC1[]
3815; EG-NEXT:    TEX 0 @56
3816; EG-NEXT:    ALU 5, @130, KC0[], KC1[]
3817; EG-NEXT:    TEX 0 @58
3818; EG-NEXT:    ALU 5, @136, KC0[], KC1[]
3819; EG-NEXT:    TEX 0 @60
3820; EG-NEXT:    ALU 5, @142, KC0[], KC1[]
3821; EG-NEXT:    TEX 0 @62
3822; EG-NEXT:    ALU 5, @148, KC0[], KC1[]
3823; EG-NEXT:    TEX 0 @64
3824; EG-NEXT:    ALU 5, @154, KC0[], KC1[]
3825; EG-NEXT:    TEX 0 @66
3826; EG-NEXT:    ALU 13, @160, KC0[CB0:0-32], KC1[]
3827; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
3828; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 1
3829; EG-NEXT:    CF_END
3830; EG-NEXT:    Fetch clause starting at 36:
3831; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 98, #3
3832; EG-NEXT:    Fetch clause starting at 38:
3833; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 90, #3
3834; EG-NEXT:    Fetch clause starting at 40:
3835; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 82, #3
3836; EG-NEXT:    Fetch clause starting at 42:
3837; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 74, #3
3838; EG-NEXT:    Fetch clause starting at 44:
3839; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 96, #3
3840; EG-NEXT:    Fetch clause starting at 46:
3841; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 88, #3
3842; EG-NEXT:    Fetch clause starting at 48:
3843; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 80, #3
3844; EG-NEXT:    Fetch clause starting at 50:
3845; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 72, #3
3846; EG-NEXT:    Fetch clause starting at 52:
3847; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 94, #3
3848; EG-NEXT:    Fetch clause starting at 54:
3849; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 86, #3
3850; EG-NEXT:    Fetch clause starting at 56:
3851; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 78, #3
3852; EG-NEXT:    Fetch clause starting at 58:
3853; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 70, #3
3854; EG-NEXT:    Fetch clause starting at 60:
3855; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 92, #3
3856; EG-NEXT:    Fetch clause starting at 62:
3857; EG-NEXT:     VTX_READ_16 T12.X, T11.X, 84, #3
3858; EG-NEXT:    Fetch clause starting at 64:
3859; EG-NEXT:     VTX_READ_16 T13.X, T11.X, 76, #3
3860; EG-NEXT:    Fetch clause starting at 66:
3861; EG-NEXT:     VTX_READ_16 T11.X, T11.X, 68, #3
3862; EG-NEXT:    ALU clause starting at 68:
3863; EG-NEXT:     MOV * T0.Y, T3.X,
3864; EG-NEXT:     MOV * T11.X, 0.0,
3865; EG-NEXT:    ALU clause starting at 70:
3866; EG-NEXT:     LSHL T0.W, T12.X, literal.x,
3867; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
3868; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
3869; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
3870; EG-NEXT:     MOV T3.X, PV.W,
3871; EG-NEXT:     MOV * T0.Y, T5.X,
3872; EG-NEXT:    ALU clause starting at 76:
3873; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3874; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3875; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3876; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3877; EG-NEXT:     MOV T5.X, PV.W,
3878; EG-NEXT:     MOV * T0.Y, T7.X,
3879; EG-NEXT:    ALU clause starting at 82:
3880; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3881; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3882; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3883; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3884; EG-NEXT:     MOV T7.X, PV.W,
3885; EG-NEXT:     MOV * T0.Y, T9.X,
3886; EG-NEXT:    ALU clause starting at 88:
3887; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3888; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3889; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3890; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3891; EG-NEXT:     MOV T9.X, PV.W,
3892; EG-NEXT:     MOV * T0.Y, T3.X,
3893; EG-NEXT:    ALU clause starting at 94:
3894; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3895; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3896; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3897; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3898; EG-NEXT:     MOV T3.X, PV.W,
3899; EG-NEXT:     MOV * T0.Y, T5.X,
3900; EG-NEXT:    ALU clause starting at 100:
3901; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3902; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3903; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3904; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3905; EG-NEXT:     MOV T5.X, PV.W,
3906; EG-NEXT:     MOV * T0.Y, T7.X,
3907; EG-NEXT:    ALU clause starting at 106:
3908; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3909; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3910; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3911; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3912; EG-NEXT:     MOV T7.X, PV.W,
3913; EG-NEXT:     MOV * T0.Y, T9.X,
3914; EG-NEXT:    ALU clause starting at 112:
3915; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3916; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3917; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3918; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3919; EG-NEXT:     MOV T9.X, PV.W,
3920; EG-NEXT:     MOV * T0.Y, T2.X,
3921; EG-NEXT:    ALU clause starting at 118:
3922; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3923; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3924; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3925; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3926; EG-NEXT:     MOV T2.X, PV.W,
3927; EG-NEXT:     MOV * T0.Y, T4.X,
3928; EG-NEXT:    ALU clause starting at 124:
3929; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3930; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3931; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3932; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3933; EG-NEXT:     MOV T4.X, PV.W,
3934; EG-NEXT:     MOV * T0.Y, T6.X,
3935; EG-NEXT:    ALU clause starting at 130:
3936; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3937; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3938; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3939; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3940; EG-NEXT:     MOV T6.X, PV.W,
3941; EG-NEXT:     MOV * T0.Y, T8.X,
3942; EG-NEXT:    ALU clause starting at 136:
3943; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3944; EG-NEXT:     LSHL * T1.W, T12.X, literal.y,
3945; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
3946; EG-NEXT:     OR_INT * T0.W, PV.W, PS,
3947; EG-NEXT:     MOV T8.X, PV.W,
3948; EG-NEXT:     MOV * T0.Y, T2.X,
3949; EG-NEXT:    ALU clause starting at 142:
3950; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3951; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3952; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3953; EG-NEXT:     OR_INT * T12.Z, PV.W, PS,
3954; EG-NEXT:     MOV T2.X, PV.Z,
3955; EG-NEXT:     MOV * T0.Y, T4.X,
3956; EG-NEXT:    ALU clause starting at 148:
3957; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3958; EG-NEXT:     AND_INT * T1.W, T12.X, literal.y,
3959; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3960; EG-NEXT:     OR_INT * T12.X, PV.W, PS,
3961; EG-NEXT:     MOV T4.X, PV.X,
3962; EG-NEXT:     MOV * T0.Y, T6.X,
3963; EG-NEXT:    ALU clause starting at 154:
3964; EG-NEXT:     AND_INT T0.W, T0.Y, literal.x,
3965; EG-NEXT:     AND_INT * T1.W, T13.X, literal.y,
3966; EG-NEXT:    -65536(nan), 65535(9.183409e-41)
3967; EG-NEXT:     OR_INT * T11.Z, PV.W, PS,
3968; EG-NEXT:     MOV T6.X, PV.Z,
3969; EG-NEXT:     MOV * T0.Y, T8.X,
3970; EG-NEXT:    ALU clause starting at 160:
3971; EG-NEXT:     LSHR T13.X, KC0[2].Y, literal.x,
3972; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
3973; EG-NEXT:    2(2.802597e-45), 16(2.242078e-44)
3974; EG-NEXT:     LSHR T14.X, PV.W, literal.x,
3975; EG-NEXT:     AND_INT T0.W, T0.Y, literal.y,
3976; EG-NEXT:     AND_INT * T1.W, T11.X, literal.z,
3977; EG-NEXT:    2(2.802597e-45), -65536(nan)
3978; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
3979; EG-NEXT:     OR_INT * T11.X, PV.W, PS,
3980; EG-NEXT:     MOV T8.X, PV.X,
3981; EG-NEXT:     MOV * T12.W, T3.X,
3982; EG-NEXT:     MOV T12.Y, T5.X,
3983; EG-NEXT:     MOV T11.W, T7.X, BS:VEC_120/SCL_212
3984; EG-NEXT:     MOV * T11.Y, T9.X,
3985;
3986; CM-LABEL: v16i16_arg:
3987; CM:       ; %bb.0: ; %entry
3988; CM-NEXT:    ALU 1, @68, KC0[], KC1[]
3989; CM-NEXT:    TEX 0 @36
3990; CM-NEXT:    ALU 5, @70, KC0[], KC1[]
3991; CM-NEXT:    TEX 0 @38
3992; CM-NEXT:    ALU 5, @76, KC0[], KC1[]
3993; CM-NEXT:    TEX 0 @40
3994; CM-NEXT:    ALU 5, @82, KC0[], KC1[]
3995; CM-NEXT:    TEX 0 @42
3996; CM-NEXT:    ALU 5, @88, KC0[], KC1[]
3997; CM-NEXT:    TEX 0 @44
3998; CM-NEXT:    ALU 5, @94, KC0[], KC1[]
3999; CM-NEXT:    TEX 0 @46
4000; CM-NEXT:    ALU 5, @100, KC0[], KC1[]
4001; CM-NEXT:    TEX 0 @48
4002; CM-NEXT:    ALU 5, @106, KC0[], KC1[]
4003; CM-NEXT:    TEX 0 @50
4004; CM-NEXT:    ALU 5, @112, KC0[], KC1[]
4005; CM-NEXT:    TEX 0 @52
4006; CM-NEXT:    ALU 5, @118, KC0[], KC1[]
4007; CM-NEXT:    TEX 0 @54
4008; CM-NEXT:    ALU 5, @124, KC0[], KC1[]
4009; CM-NEXT:    TEX 0 @56
4010; CM-NEXT:    ALU 5, @130, KC0[], KC1[]
4011; CM-NEXT:    TEX 0 @58
4012; CM-NEXT:    ALU 5, @136, KC0[], KC1[]
4013; CM-NEXT:    TEX 0 @60
4014; CM-NEXT:    ALU 5, @142, KC0[], KC1[]
4015; CM-NEXT:    TEX 0 @62
4016; CM-NEXT:    ALU 5, @148, KC0[], KC1[]
4017; CM-NEXT:    TEX 0 @64
4018; CM-NEXT:    ALU 5, @154, KC0[], KC1[]
4019; CM-NEXT:    TEX 0 @66
4020; CM-NEXT:    ALU 14, @160, KC0[CB0:0-32], KC1[]
4021; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
4022; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T12, T13.X
4023; CM-NEXT:    CF_END
4024; CM-NEXT:    Fetch clause starting at 36:
4025; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 98, #3
4026; CM-NEXT:    Fetch clause starting at 38:
4027; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 90, #3
4028; CM-NEXT:    Fetch clause starting at 40:
4029; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 82, #3
4030; CM-NEXT:    Fetch clause starting at 42:
4031; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 74, #3
4032; CM-NEXT:    Fetch clause starting at 44:
4033; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 96, #3
4034; CM-NEXT:    Fetch clause starting at 46:
4035; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 88, #3
4036; CM-NEXT:    Fetch clause starting at 48:
4037; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 80, #3
4038; CM-NEXT:    Fetch clause starting at 50:
4039; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 72, #3
4040; CM-NEXT:    Fetch clause starting at 52:
4041; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 94, #3
4042; CM-NEXT:    Fetch clause starting at 54:
4043; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 86, #3
4044; CM-NEXT:    Fetch clause starting at 56:
4045; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 78, #3
4046; CM-NEXT:    Fetch clause starting at 58:
4047; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 70, #3
4048; CM-NEXT:    Fetch clause starting at 60:
4049; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 92, #3
4050; CM-NEXT:    Fetch clause starting at 62:
4051; CM-NEXT:     VTX_READ_16 T12.X, T11.X, 84, #3
4052; CM-NEXT:    Fetch clause starting at 64:
4053; CM-NEXT:     VTX_READ_16 T13.X, T11.X, 76, #3
4054; CM-NEXT:    Fetch clause starting at 66:
4055; CM-NEXT:     VTX_READ_16 T11.X, T11.X, 68, #3
4056; CM-NEXT:    ALU clause starting at 68:
4057; CM-NEXT:     MOV * T0.Y, T3.X,
4058; CM-NEXT:     MOV * T11.X, 0.0,
4059; CM-NEXT:    ALU clause starting at 70:
4060; CM-NEXT:     LSHL T0.Z, T12.X, literal.x,
4061; CM-NEXT:     AND_INT * T0.W, T0.Y, literal.y,
4062; CM-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
4063; CM-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
4064; CM-NEXT:     MOV T3.X, PV.W,
4065; CM-NEXT:     MOV * T0.Y, T5.X,
4066; CM-NEXT:    ALU clause starting at 76:
4067; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4068; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4069; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4070; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4071; CM-NEXT:     MOV T5.X, PV.W,
4072; CM-NEXT:     MOV * T0.Y, T7.X,
4073; CM-NEXT:    ALU clause starting at 82:
4074; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4075; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4076; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4077; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4078; CM-NEXT:     MOV T7.X, PV.W,
4079; CM-NEXT:     MOV * T0.Y, T9.X,
4080; CM-NEXT:    ALU clause starting at 88:
4081; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4082; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4083; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4084; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4085; CM-NEXT:     MOV T9.X, PV.W,
4086; CM-NEXT:     MOV * T0.Y, T3.X,
4087; CM-NEXT:    ALU clause starting at 94:
4088; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4089; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4090; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4091; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4092; CM-NEXT:     MOV T3.X, PV.W,
4093; CM-NEXT:     MOV * T0.Y, T5.X,
4094; CM-NEXT:    ALU clause starting at 100:
4095; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4096; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4097; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4098; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4099; CM-NEXT:     MOV T5.X, PV.W,
4100; CM-NEXT:     MOV * T0.Y, T7.X,
4101; CM-NEXT:    ALU clause starting at 106:
4102; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4103; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4104; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4105; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4106; CM-NEXT:     MOV T7.X, PV.W,
4107; CM-NEXT:     MOV * T0.Y, T9.X,
4108; CM-NEXT:    ALU clause starting at 112:
4109; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4110; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4111; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4112; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4113; CM-NEXT:     MOV T9.X, PV.W,
4114; CM-NEXT:     MOV * T0.Y, T2.X,
4115; CM-NEXT:    ALU clause starting at 118:
4116; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4117; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4118; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4119; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4120; CM-NEXT:     MOV T2.X, PV.W,
4121; CM-NEXT:     MOV * T0.Y, T4.X,
4122; CM-NEXT:    ALU clause starting at 124:
4123; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4124; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4125; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4126; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4127; CM-NEXT:     MOV T4.X, PV.W,
4128; CM-NEXT:     MOV * T0.Y, T6.X,
4129; CM-NEXT:    ALU clause starting at 130:
4130; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4131; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4132; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4133; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4134; CM-NEXT:     MOV T6.X, PV.W,
4135; CM-NEXT:     MOV * T0.Y, T8.X,
4136; CM-NEXT:    ALU clause starting at 136:
4137; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4138; CM-NEXT:     LSHL * T0.W, T12.X, literal.y,
4139; CM-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
4140; CM-NEXT:     OR_INT * T0.W, PV.Z, PV.W,
4141; CM-NEXT:     MOV T8.X, PV.W,
4142; CM-NEXT:     MOV * T0.Y, T2.X,
4143; CM-NEXT:    ALU clause starting at 142:
4144; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4145; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4146; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4147; CM-NEXT:     OR_INT * T12.Z, PV.Z, PV.W,
4148; CM-NEXT:     MOV T2.X, PV.Z,
4149; CM-NEXT:     MOV * T0.Y, T4.X,
4150; CM-NEXT:    ALU clause starting at 148:
4151; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4152; CM-NEXT:     AND_INT * T0.W, T12.X, literal.y,
4153; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4154; CM-NEXT:     OR_INT * T12.X, PV.Z, PV.W,
4155; CM-NEXT:     MOV T4.X, PV.X,
4156; CM-NEXT:     MOV * T0.Y, T6.X,
4157; CM-NEXT:    ALU clause starting at 154:
4158; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.x,
4159; CM-NEXT:     AND_INT * T0.W, T13.X, literal.y,
4160; CM-NEXT:    -65536(nan), 65535(9.183409e-41)
4161; CM-NEXT:     OR_INT * T11.Z, PV.Z, PV.W,
4162; CM-NEXT:     MOV T6.X, PV.Z,
4163; CM-NEXT:     MOV * T0.Y, T8.X,
4164; CM-NEXT:    ALU clause starting at 160:
4165; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4166; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4167; CM-NEXT:     LSHR * T13.X, PV.W, literal.x,
4168; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4169; CM-NEXT:     LSHR T14.X, KC0[2].Y, literal.x,
4170; CM-NEXT:     AND_INT T0.Z, T0.Y, literal.y,
4171; CM-NEXT:     AND_INT * T0.W, T11.X, literal.z,
4172; CM-NEXT:    2(2.802597e-45), -65536(nan)
4173; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
4174; CM-NEXT:     OR_INT * T11.X, PV.Z, PV.W,
4175; CM-NEXT:     MOV T8.X, PV.X,
4176; CM-NEXT:     MOV * T12.W, T3.X,
4177; CM-NEXT:     MOV T12.Y, T5.X,
4178; CM-NEXT:     MOV * T11.W, T7.X, BS:VEC_120/SCL_212
4179; CM-NEXT:     MOV * T11.Y, T9.X,
4180entry:
4181  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
4182  ret void
4183}
4184
; v16i32_arg - pass a <16 x i32> vector as a by-value kernel argument and store
; it unchanged through the global (addrspace(1)) pointer %out.
;
; What the expectations below pin down:
;  * amdgcn targets (SI, VI, GFX9 runs) fetch the whole vector with a single
;    s_load_dwordx16 (at 0x19, 0x64 and 0x40 respectively) and the pointer with
;    an s_load_dwordx2, then write the data back as four dwordx4 stores
;    covering offsets 48, 32, 16 and 0 from %out.
;  * r600 targets (redwood and cayman runs) read the sixteen elements from
;    KC0[6].Y through KC0[10].X, derive the four store addresses from KC0[2].Y
;    (plus 16/32/48, each shifted right by 2) and issue four MEM_RAT_CACHELESS
;    stores.
;
; NOTE(review): the gap between the pointer and the vector in the argument
; area presumably comes from the alignment of the vector type - confirm
; against the target's kernarg layout rules before relying on it.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by
; hand.
4185define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
4186; SI-LABEL: v16i32_arg:
4187; SI:       ; %bb.0: ; %entry
4188; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
4189; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4190; SI-NEXT:    s_mov_b32 s3, 0xf000
4191; SI-NEXT:    s_mov_b32 s2, -1
4192; SI-NEXT:    s_waitcnt lgkmcnt(0)
4193; SI-NEXT:    v_mov_b32_e32 v0, s16
4194; SI-NEXT:    v_mov_b32_e32 v1, s17
4195; SI-NEXT:    v_mov_b32_e32 v2, s18
4196; SI-NEXT:    v_mov_b32_e32 v3, s19
4197; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4198; SI-NEXT:    s_waitcnt expcnt(0)
4199; SI-NEXT:    v_mov_b32_e32 v0, s12
4200; SI-NEXT:    v_mov_b32_e32 v1, s13
4201; SI-NEXT:    v_mov_b32_e32 v2, s14
4202; SI-NEXT:    v_mov_b32_e32 v3, s15
4203; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4204; SI-NEXT:    s_waitcnt expcnt(0)
4205; SI-NEXT:    v_mov_b32_e32 v0, s8
4206; SI-NEXT:    v_mov_b32_e32 v1, s9
4207; SI-NEXT:    v_mov_b32_e32 v2, s10
4208; SI-NEXT:    v_mov_b32_e32 v3, s11
4209; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4210; SI-NEXT:    s_waitcnt expcnt(0)
4211; SI-NEXT:    v_mov_b32_e32 v0, s4
4212; SI-NEXT:    v_mov_b32_e32 v1, s5
4213; SI-NEXT:    v_mov_b32_e32 v2, s6
4214; SI-NEXT:    v_mov_b32_e32 v3, s7
4215; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4216; SI-NEXT:    s_endpgm
4217;
4218; VI-LABEL: v16i32_arg:
4219; VI:       ; %bb.0: ; %entry
4220; VI-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
4221; VI-NEXT:    s_load_dwordx16 s[0:15], s[0:1], 0x64
4222; VI-NEXT:    s_waitcnt lgkmcnt(0)
4223; VI-NEXT:    v_mov_b32_e32 v0, s12
4224; VI-NEXT:    s_add_u32 s12, s16, 48
4225; VI-NEXT:    v_mov_b32_e32 v1, s13
4226; VI-NEXT:    s_addc_u32 s13, s17, 0
4227; VI-NEXT:    v_mov_b32_e32 v4, s12
4228; VI-NEXT:    v_mov_b32_e32 v2, s14
4229; VI-NEXT:    v_mov_b32_e32 v3, s15
4230; VI-NEXT:    v_mov_b32_e32 v5, s13
4231; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4232; VI-NEXT:    s_nop 0
4233; VI-NEXT:    v_mov_b32_e32 v0, s8
4234; VI-NEXT:    s_add_u32 s8, s16, 32
4235; VI-NEXT:    v_mov_b32_e32 v1, s9
4236; VI-NEXT:    s_addc_u32 s9, s17, 0
4237; VI-NEXT:    v_mov_b32_e32 v4, s8
4238; VI-NEXT:    v_mov_b32_e32 v2, s10
4239; VI-NEXT:    v_mov_b32_e32 v3, s11
4240; VI-NEXT:    v_mov_b32_e32 v5, s9
4241; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4242; VI-NEXT:    s_nop 0
4243; VI-NEXT:    v_mov_b32_e32 v0, s4
4244; VI-NEXT:    s_add_u32 s4, s16, 16
4245; VI-NEXT:    v_mov_b32_e32 v1, s5
4246; VI-NEXT:    s_addc_u32 s5, s17, 0
4247; VI-NEXT:    v_mov_b32_e32 v4, s4
4248; VI-NEXT:    v_mov_b32_e32 v2, s6
4249; VI-NEXT:    v_mov_b32_e32 v3, s7
4250; VI-NEXT:    v_mov_b32_e32 v5, s5
4251; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4252; VI-NEXT:    v_mov_b32_e32 v4, s16
4253; VI-NEXT:    v_mov_b32_e32 v0, s0
4254; VI-NEXT:    v_mov_b32_e32 v1, s1
4255; VI-NEXT:    v_mov_b32_e32 v2, s2
4256; VI-NEXT:    v_mov_b32_e32 v3, s3
4257; VI-NEXT:    v_mov_b32_e32 v5, s17
4258; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4259; VI-NEXT:    s_endpgm
4260;
4261; GFX9-LABEL: v16i32_arg:
4262; GFX9:       ; %bb.0: ; %entry
4263; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
4264; GFX9-NEXT:    v_mov_b32_e32 v4, 0
4265; GFX9-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x40
4266; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4267; GFX9-NEXT:    v_mov_b32_e32 v0, s12
4268; GFX9-NEXT:    v_mov_b32_e32 v1, s13
4269; GFX9-NEXT:    v_mov_b32_e32 v2, s14
4270; GFX9-NEXT:    v_mov_b32_e32 v3, s15
4271; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:48
4272; GFX9-NEXT:    s_nop 0
4273; GFX9-NEXT:    v_mov_b32_e32 v0, s8
4274; GFX9-NEXT:    v_mov_b32_e32 v1, s9
4275; GFX9-NEXT:    v_mov_b32_e32 v2, s10
4276; GFX9-NEXT:    v_mov_b32_e32 v3, s11
4277; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
4278; GFX9-NEXT:    s_nop 0
4279; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4280; GFX9-NEXT:    v_mov_b32_e32 v1, s5
4281; GFX9-NEXT:    v_mov_b32_e32 v2, s6
4282; GFX9-NEXT:    v_mov_b32_e32 v3, s7
4283; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
4284; GFX9-NEXT:    s_nop 0
4285; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4286; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4287; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4288; GFX9-NEXT:    v_mov_b32_e32 v3, s3
4289; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
4290; GFX9-NEXT:    s_endpgm
4291;
4292; EG-LABEL: v16i32_arg:
4293; EG:       ; %bb.0: ; %entry
4294; EG-NEXT:    ALU 29, @6, KC0[CB0:0-32], KC1[]
4295; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4296; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4297; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4298; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4299; EG-NEXT:    CF_END
4300; EG-NEXT:    ALU clause starting at 6:
4301; EG-NEXT:     MOV * T0.W, KC0[7].X,
4302; EG-NEXT:     MOV * T0.Z, KC0[6].W,
4303; EG-NEXT:     MOV T0.Y, KC0[6].Z,
4304; EG-NEXT:     MOV * T1.W, KC0[8].X,
4305; EG-NEXT:     MOV T0.X, KC0[6].Y,
4306; EG-NEXT:     MOV * T1.Z, KC0[7].W,
4307; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
4308; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
4309; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4310; EG-NEXT:     MOV * T3.W, KC0[9].X,
4311; EG-NEXT:     MOV T1.X, KC0[7].Y,
4312; EG-NEXT:     MOV * T3.Z, KC0[8].W,
4313; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4314; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4315; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
4316; EG-NEXT:     MOV T3.Y, KC0[8].Z,
4317; EG-NEXT:     MOV * T5.W, KC0[10].X,
4318; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4319; EG-NEXT:     MOV T3.X, KC0[8].Y,
4320; EG-NEXT:     MOV * T5.Z, KC0[9].W,
4321; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4322; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4323; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
4324; EG-NEXT:     MOV T5.Y, KC0[9].Z,
4325; EG-NEXT:     MOV * T5.X, KC0[9].Y,
4326; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4327; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4328; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4329; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
4330; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4331;
4332; CM-LABEL: v16i32_arg:
4333; CM:       ; %bb.0: ; %entry
4334; CM-NEXT:    ALU 28, @6, KC0[CB0:0-32], KC1[]
4335; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4336; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4337; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4338; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4339; CM-NEXT:    CF_END
4340; CM-NEXT:    ALU clause starting at 6:
4341; CM-NEXT:     MOV * T0.W, KC0[10].X,
4342; CM-NEXT:     MOV * T0.Z, KC0[9].W,
4343; CM-NEXT:     MOV * T0.Y, KC0[9].Z,
4344; CM-NEXT:     MOV T0.X, KC0[9].Y,
4345; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
4346; CM-NEXT:     MOV * T2.W, KC0[9].X,
4347; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4348; CM-NEXT:     MOV T2.Z, KC0[8].W,
4349; CM-NEXT:     MOV * T1.W, KC0[8].X,
4350; CM-NEXT:     LSHR T3.X, T1.Z, literal.x,
4351; CM-NEXT:     MOV T2.Y, KC0[8].Z,
4352; CM-NEXT:     MOV * T1.Z, KC0[7].W,
4353; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4354; CM-NEXT:     MOV T2.X, KC0[8].Y,
4355; CM-NEXT:     MOV * T1.Y, KC0[7].Z,
4356; CM-NEXT:     MOV T1.X, KC0[7].Y,
4357; CM-NEXT:     ADD_INT T3.Z, KC0[2].Y, literal.x,
4358; CM-NEXT:     MOV * T4.W, KC0[7].X,
4359; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4360; CM-NEXT:     LSHR T5.X, PV.Z, literal.x,
4361; CM-NEXT:     MOV T4.Z, KC0[6].W,
4362; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
4363; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4364; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
4365; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
4366; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4367; CM-NEXT:     MOV * T4.X, KC0[6].Y,
4368; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
4369; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: one vector store that promises only 4-byte alignment.
4370entry:
4371  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
4372  ret void
4373}
4374
; v16f32_arg - floating-point counterpart of the 16-element vector argument
; test: a <16 x float> by-value kernel argument is stored unchanged through the
; global (addrspace(1)) pointer %out.
;
; What the expectations below pin down:
;  * No conversion or arithmetic on the data appears; the elements are only
;    loaded, copied between registers and stored.
;  * amdgcn targets (SI, VI, GFX9 runs) use one s_load_dwordx16 for the vector
;    (at 0x19, 0x64 and 0x40 respectively) plus an s_load_dwordx2 for the
;    pointer, followed by four dwordx4 stores covering offsets 48, 32, 16, 0.
;  * r600 targets (redwood and cayman runs) read the elements from KC0[6].Y
;    through KC0[10].X and issue four MEM_RAT_CACHELESS stores whose addresses
;    are derived from KC0[2].Y.
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by
; hand.
4375define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
4376; SI-LABEL: v16f32_arg:
4377; SI:       ; %bb.0: ; %entry
4378; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
4379; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4380; SI-NEXT:    s_mov_b32 s3, 0xf000
4381; SI-NEXT:    s_mov_b32 s2, -1
4382; SI-NEXT:    s_waitcnt lgkmcnt(0)
4383; SI-NEXT:    v_mov_b32_e32 v0, s16
4384; SI-NEXT:    v_mov_b32_e32 v1, s17
4385; SI-NEXT:    v_mov_b32_e32 v2, s18
4386; SI-NEXT:    v_mov_b32_e32 v3, s19
4387; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
4388; SI-NEXT:    s_waitcnt expcnt(0)
4389; SI-NEXT:    v_mov_b32_e32 v0, s12
4390; SI-NEXT:    v_mov_b32_e32 v1, s13
4391; SI-NEXT:    v_mov_b32_e32 v2, s14
4392; SI-NEXT:    v_mov_b32_e32 v3, s15
4393; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
4394; SI-NEXT:    s_waitcnt expcnt(0)
4395; SI-NEXT:    v_mov_b32_e32 v0, s8
4396; SI-NEXT:    v_mov_b32_e32 v1, s9
4397; SI-NEXT:    v_mov_b32_e32 v2, s10
4398; SI-NEXT:    v_mov_b32_e32 v3, s11
4399; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
4400; SI-NEXT:    s_waitcnt expcnt(0)
4401; SI-NEXT:    v_mov_b32_e32 v0, s4
4402; SI-NEXT:    v_mov_b32_e32 v1, s5
4403; SI-NEXT:    v_mov_b32_e32 v2, s6
4404; SI-NEXT:    v_mov_b32_e32 v3, s7
4405; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
4406; SI-NEXT:    s_endpgm
4407;
4408; VI-LABEL: v16f32_arg:
4409; VI:       ; %bb.0: ; %entry
4410; VI-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
4411; VI-NEXT:    s_load_dwordx16 s[0:15], s[0:1], 0x64
4412; VI-NEXT:    s_waitcnt lgkmcnt(0)
4413; VI-NEXT:    v_mov_b32_e32 v0, s12
4414; VI-NEXT:    s_add_u32 s12, s16, 48
4415; VI-NEXT:    v_mov_b32_e32 v1, s13
4416; VI-NEXT:    s_addc_u32 s13, s17, 0
4417; VI-NEXT:    v_mov_b32_e32 v4, s12
4418; VI-NEXT:    v_mov_b32_e32 v2, s14
4419; VI-NEXT:    v_mov_b32_e32 v3, s15
4420; VI-NEXT:    v_mov_b32_e32 v5, s13
4421; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4422; VI-NEXT:    s_nop 0
4423; VI-NEXT:    v_mov_b32_e32 v0, s8
4424; VI-NEXT:    s_add_u32 s8, s16, 32
4425; VI-NEXT:    v_mov_b32_e32 v1, s9
4426; VI-NEXT:    s_addc_u32 s9, s17, 0
4427; VI-NEXT:    v_mov_b32_e32 v4, s8
4428; VI-NEXT:    v_mov_b32_e32 v2, s10
4429; VI-NEXT:    v_mov_b32_e32 v3, s11
4430; VI-NEXT:    v_mov_b32_e32 v5, s9
4431; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4432; VI-NEXT:    s_nop 0
4433; VI-NEXT:    v_mov_b32_e32 v0, s4
4434; VI-NEXT:    s_add_u32 s4, s16, 16
4435; VI-NEXT:    v_mov_b32_e32 v1, s5
4436; VI-NEXT:    s_addc_u32 s5, s17, 0
4437; VI-NEXT:    v_mov_b32_e32 v4, s4
4438; VI-NEXT:    v_mov_b32_e32 v2, s6
4439; VI-NEXT:    v_mov_b32_e32 v3, s7
4440; VI-NEXT:    v_mov_b32_e32 v5, s5
4441; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4442; VI-NEXT:    v_mov_b32_e32 v4, s16
4443; VI-NEXT:    v_mov_b32_e32 v0, s0
4444; VI-NEXT:    v_mov_b32_e32 v1, s1
4445; VI-NEXT:    v_mov_b32_e32 v2, s2
4446; VI-NEXT:    v_mov_b32_e32 v3, s3
4447; VI-NEXT:    v_mov_b32_e32 v5, s17
4448; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
4449; VI-NEXT:    s_endpgm
4450;
4451; GFX9-LABEL: v16f32_arg:
4452; GFX9:       ; %bb.0: ; %entry
4453; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
4454; GFX9-NEXT:    v_mov_b32_e32 v4, 0
4455; GFX9-NEXT:    s_load_dwordx16 s[0:15], s[4:5], 0x40
4456; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4457; GFX9-NEXT:    v_mov_b32_e32 v0, s12
4458; GFX9-NEXT:    v_mov_b32_e32 v1, s13
4459; GFX9-NEXT:    v_mov_b32_e32 v2, s14
4460; GFX9-NEXT:    v_mov_b32_e32 v3, s15
4461; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:48
4462; GFX9-NEXT:    s_nop 0
4463; GFX9-NEXT:    v_mov_b32_e32 v0, s8
4464; GFX9-NEXT:    v_mov_b32_e32 v1, s9
4465; GFX9-NEXT:    v_mov_b32_e32 v2, s10
4466; GFX9-NEXT:    v_mov_b32_e32 v3, s11
4467; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:32
4468; GFX9-NEXT:    s_nop 0
4469; GFX9-NEXT:    v_mov_b32_e32 v0, s4
4470; GFX9-NEXT:    v_mov_b32_e32 v1, s5
4471; GFX9-NEXT:    v_mov_b32_e32 v2, s6
4472; GFX9-NEXT:    v_mov_b32_e32 v3, s7
4473; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17] offset:16
4474; GFX9-NEXT:    s_nop 0
4475; GFX9-NEXT:    v_mov_b32_e32 v0, s0
4476; GFX9-NEXT:    v_mov_b32_e32 v1, s1
4477; GFX9-NEXT:    v_mov_b32_e32 v2, s2
4478; GFX9-NEXT:    v_mov_b32_e32 v3, s3
4479; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
4480; GFX9-NEXT:    s_endpgm
4481;
4482; EG-LABEL: v16f32_arg:
4483; EG:       ; %bb.0: ; %entry
4484; EG-NEXT:    ALU 29, @6, KC0[CB0:0-32], KC1[]
4485; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
4486; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T6.X, 0
4487; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T4.X, 0
4488; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
4489; EG-NEXT:    CF_END
4490; EG-NEXT:    ALU clause starting at 6:
4491; EG-NEXT:     MOV * T0.W, KC0[7].X,
4492; EG-NEXT:     MOV * T0.Z, KC0[6].W,
4493; EG-NEXT:     MOV T0.Y, KC0[6].Z,
4494; EG-NEXT:     MOV * T1.W, KC0[8].X,
4495; EG-NEXT:     MOV T0.X, KC0[6].Y,
4496; EG-NEXT:     MOV * T1.Z, KC0[7].W,
4497; EG-NEXT:     LSHR T2.X, KC0[2].Y, literal.x,
4498; EG-NEXT:     MOV * T1.Y, KC0[7].Z,
4499; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4500; EG-NEXT:     MOV * T3.W, KC0[9].X,
4501; EG-NEXT:     MOV T1.X, KC0[7].Y,
4502; EG-NEXT:     MOV * T3.Z, KC0[8].W,
4503; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4504; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
4505; EG-NEXT:     LSHR T4.X, PV.W, literal.x,
4506; EG-NEXT:     MOV T3.Y, KC0[8].Z,
4507; EG-NEXT:     MOV * T5.W, KC0[10].X,
4508; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4509; EG-NEXT:     MOV T3.X, KC0[8].Y,
4510; EG-NEXT:     MOV * T5.Z, KC0[9].W,
4511; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4512; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4513; EG-NEXT:     LSHR T6.X, PV.W, literal.x,
4514; EG-NEXT:     MOV T5.Y, KC0[9].Z,
4515; EG-NEXT:     MOV * T5.X, KC0[9].Y,
4516; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4517; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4518; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4519; EG-NEXT:     LSHR * T7.X, PV.W, literal.x,
4520; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4521;
4522; CM-LABEL: v16f32_arg:
4523; CM:       ; %bb.0: ; %entry
4524; CM-NEXT:    ALU 28, @6, KC0[CB0:0-32], KC1[]
4525; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
4526; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T6.X
4527; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T5.X
4528; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
4529; CM-NEXT:    CF_END
4530; CM-NEXT:    ALU clause starting at 6:
4531; CM-NEXT:     MOV * T0.W, KC0[10].X,
4532; CM-NEXT:     MOV * T0.Z, KC0[9].W,
4533; CM-NEXT:     MOV * T0.Y, KC0[9].Z,
4534; CM-NEXT:     MOV T0.X, KC0[9].Y,
4535; CM-NEXT:     ADD_INT T1.Z, KC0[2].Y, literal.x,
4536; CM-NEXT:     MOV * T2.W, KC0[9].X,
4537; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
4538; CM-NEXT:     MOV T2.Z, KC0[8].W,
4539; CM-NEXT:     MOV * T1.W, KC0[8].X,
4540; CM-NEXT:     LSHR T3.X, T1.Z, literal.x,
4541; CM-NEXT:     MOV T2.Y, KC0[8].Z,
4542; CM-NEXT:     MOV * T1.Z, KC0[7].W,
4543; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4544; CM-NEXT:     MOV T2.X, KC0[8].Y,
4545; CM-NEXT:     MOV * T1.Y, KC0[7].Z,
4546; CM-NEXT:     MOV T1.X, KC0[7].Y,
4547; CM-NEXT:     ADD_INT T3.Z, KC0[2].Y, literal.x,
4548; CM-NEXT:     MOV * T4.W, KC0[7].X,
4549; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
4550; CM-NEXT:     LSHR T5.X, PV.Z, literal.x,
4551; CM-NEXT:     MOV T4.Z, KC0[6].W,
4552; CM-NEXT:     ADD_INT * T3.W, KC0[2].Y, literal.y,
4553; CM-NEXT:    2(2.802597e-45), 16(2.242078e-44)
4554; CM-NEXT:     LSHR T6.X, PV.W, literal.x,
4555; CM-NEXT:     MOV * T4.Y, KC0[6].Z,
4556; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4557; CM-NEXT:     MOV * T4.X, KC0[6].Y,
4558; CM-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
4559; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: one vector store that promises only 4-byte alignment.
4560entry:
4561  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
4562  ret void
4563}
4564
; kernel_arg_i64 - pass a scalar i64 as a by-value kernel argument and store it
; through the global (addrspace(1)) pointer %out with 8-byte alignment.
;
; What the expectations below pin down:
;  * amdgcn targets (SI, VI, GFX9 runs) fetch the pointer and the i64 together
;    with a single s_load_dwordx4 (at 0x9, 0x24 and 0x0 respectively) and emit
;    one dwordx2 store.
;  * r600 targets (redwood and cayman runs) take the two value dwords from
;    KC0[2].W and KC0[3].X, form the store address as KC0[2].Y shifted right
;    by 2, and emit one MEM_RAT_CACHELESS store.
;
; The function body has no named entry block, which is why the block
; annotations in the expectations read "%bb.0" without a trailing "%entry".
;
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of editing by
; hand.
4565define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
4566; SI-LABEL: kernel_arg_i64:
4567; SI:       ; %bb.0:
4568; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4569; SI-NEXT:    s_mov_b32 s7, 0xf000
4570; SI-NEXT:    s_mov_b32 s6, -1
4571; SI-NEXT:    s_waitcnt lgkmcnt(0)
4572; SI-NEXT:    s_mov_b32 s4, s0
4573; SI-NEXT:    s_mov_b32 s5, s1
4574; SI-NEXT:    v_mov_b32_e32 v0, s2
4575; SI-NEXT:    v_mov_b32_e32 v1, s3
4576; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4577; SI-NEXT:    s_endpgm
4578;
4579; VI-LABEL: kernel_arg_i64:
4580; VI:       ; %bb.0:
4581; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
4582; VI-NEXT:    s_waitcnt lgkmcnt(0)
4583; VI-NEXT:    v_mov_b32_e32 v0, s0
4584; VI-NEXT:    v_mov_b32_e32 v1, s1
4585; VI-NEXT:    v_mov_b32_e32 v2, s2
4586; VI-NEXT:    v_mov_b32_e32 v3, s3
4587; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4588; VI-NEXT:    s_endpgm
4589;
4590; GFX9-LABEL: kernel_arg_i64:
4591; GFX9:       ; %bb.0:
4592; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4593; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4594; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4595; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4596; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4597; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
4598; GFX9-NEXT:    s_endpgm
4599;
4600; EG-LABEL: kernel_arg_i64:
4601; EG:       ; %bb.0:
4602; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4603; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4604; EG-NEXT:    CF_END
4605; EG-NEXT:    PAD
4606; EG-NEXT:    ALU clause starting at 4:
4607; EG-NEXT:     MOV * T0.Y, KC0[3].X,
4608; EG-NEXT:     MOV T0.X, KC0[2].W,
4609; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4610; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4611;
4612; CM-LABEL: kernel_arg_i64:
4613; CM:       ; %bb.0:
4614; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4615; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4616; CM-NEXT:    CF_END
4617; CM-NEXT:    PAD
4618; CM-NEXT:    ALU clause starting at 4:
4619; CM-NEXT:     MOV * T0.Y, KC0[3].X,
4620; CM-NEXT:     MOV * T0.X, KC0[2].W,
4621; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4622; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR under test: a single 8-byte-aligned scalar store of the argument.
4623  store i64 %a, i64 addrspace(1)* %out, align 8
4624  ret void
4625}
4626
; f64_kernel_arg: kernel taking a pointer argument followed by a by-value
; double. The body stores %in through %out unchanged. In the amdgcn checks
; below both arguments arrive via one s_load_dwordx4 and the value is written
; back with a single dwordx2 store.
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the update script rather than editing by hand.
4627define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
4628; SI-LABEL: f64_kernel_arg:
4629; SI:       ; %bb.0: ; %entry
4630; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
4631; SI-NEXT:    s_mov_b32 s7, 0xf000
4632; SI-NEXT:    s_mov_b32 s6, -1
4633; SI-NEXT:    s_waitcnt lgkmcnt(0)
4634; SI-NEXT:    s_mov_b32 s4, s0
4635; SI-NEXT:    s_mov_b32 s5, s1
4636; SI-NEXT:    v_mov_b32_e32 v0, s2
4637; SI-NEXT:    v_mov_b32_e32 v1, s3
4638; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4639; SI-NEXT:    s_endpgm
4640;
4641; VI-LABEL: f64_kernel_arg:
4642; VI:       ; %bb.0: ; %entry
4643; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
4644; VI-NEXT:    s_waitcnt lgkmcnt(0)
4645; VI-NEXT:    v_mov_b32_e32 v0, s0
4646; VI-NEXT:    v_mov_b32_e32 v1, s1
4647; VI-NEXT:    v_mov_b32_e32 v2, s2
4648; VI-NEXT:    v_mov_b32_e32 v3, s3
4649; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4650; VI-NEXT:    s_endpgm
4651;
4652; GFX9-LABEL: f64_kernel_arg:
4653; GFX9:       ; %bb.0: ; %entry
4654; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
4655; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4656; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4657; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4658; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4659; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
4660; GFX9-NEXT:    s_endpgm
4661;
4662; EG-LABEL: f64_kernel_arg:
4663; EG:       ; %bb.0: ; %entry
4664; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4665; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
4666; EG-NEXT:    CF_END
4667; EG-NEXT:    PAD
4668; EG-NEXT:    ALU clause starting at 4:
4669; EG-NEXT:     MOV * T0.Y, KC0[3].X,
4670; EG-NEXT:     MOV T0.X, KC0[2].W,
4671; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4672; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4673;
4674; CM-LABEL: f64_kernel_arg:
4675; CM:       ; %bb.0: ; %entry
4676; CM-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
4677; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
4678; CM-NEXT:    CF_END
4679; CM-NEXT:    PAD
4680; CM-NEXT:    ALU clause starting at 4:
4681; CM-NEXT:     MOV * T0.Y, KC0[3].X,
4682; CM-NEXT:     MOV * T0.X, KC0[2].W,
4683; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4684; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4685entry:
4686  store double %in, double addrspace(1)* %out
4687  ret void
4688}
4689
4690; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
4691; XGCN: s_load_dwordx2
4692; XGCN: s_load_dwordx2
4693; XGCN: buffer_store_dwordx2
4694; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
4695;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
4696;   ret void
4697; }
4698
; i65_arg: kernel taking a pointer followed by an i65 by value, i.e. an
; integer one bit wider than 64. The body stores %in to %out with align 4.
; In the amdgcn checks below the value is loaded as a dwordx2 plus one extra
; dword, the extra dword is masked with 1, and the store is split into a
; dwordx2 store plus a byte store at offset 8.
; The check lines are autogenerated; regenerate rather than hand-edit.
4699define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
4700; SI-LABEL: i65_arg:
4701; SI:       ; %bb.0: ; %entry
4702; SI-NEXT:    s_load_dword s2, s[0:1], 0xd
4703; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
4704; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4705; SI-NEXT:    s_mov_b32 s3, 0xf000
4706; SI-NEXT:    s_waitcnt lgkmcnt(0)
4707; SI-NEXT:    s_and_b32 s6, s2, 1
4708; SI-NEXT:    s_mov_b32 s2, -1
4709; SI-NEXT:    v_mov_b32_e32 v0, s4
4710; SI-NEXT:    v_mov_b32_e32 v1, s5
4711; SI-NEXT:    v_mov_b32_e32 v2, s6
4712; SI-NEXT:    buffer_store_byte v2, off, s[0:3], 0 offset:8
4713; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4714; SI-NEXT:    s_endpgm
4715;
4716; VI-LABEL: i65_arg:
4717; VI:       ; %bb.0: ; %entry
4718; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4719; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
4720; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
4721; VI-NEXT:    s_waitcnt lgkmcnt(0)
4722; VI-NEXT:    v_mov_b32_e32 v0, s2
4723; VI-NEXT:    v_mov_b32_e32 v1, s3
4724; VI-NEXT:    s_and_b32 s1, s0, 1
4725; VI-NEXT:    s_add_u32 s0, s2, 8
4726; VI-NEXT:    v_mov_b32_e32 v4, s1
4727; VI-NEXT:    s_addc_u32 s1, s3, 0
4728; VI-NEXT:    v_mov_b32_e32 v3, s1
4729; VI-NEXT:    v_mov_b32_e32 v2, s0
4730; VI-NEXT:    flat_store_byte v[2:3], v4
4731; VI-NEXT:    v_mov_b32_e32 v2, s4
4732; VI-NEXT:    v_mov_b32_e32 v3, s5
4733; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
4734; VI-NEXT:    s_endpgm
4735;
4736; GFX9-LABEL: i65_arg:
4737; GFX9:       ; %bb.0: ; %entry
4738; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4739; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x8
4740; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x10
4741; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4742; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4743; GFX9-NEXT:    v_mov_b32_e32 v0, s2
4744; GFX9-NEXT:    s_and_b32 s4, s6, 1
4745; GFX9-NEXT:    v_mov_b32_e32 v3, s4
4746; GFX9-NEXT:    v_mov_b32_e32 v1, s3
4747; GFX9-NEXT:    global_store_byte v2, v3, s[0:1] offset:8
4748; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
4749; GFX9-NEXT:    s_endpgm
4750;
4751; EG-LABEL: i65_arg:
4752; EG:       ; %bb.0: ; %entry
4753; EG-NEXT:    ALU 20, @6, KC0[CB0:0-32], KC1[]
4754; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
4755; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0
4756; EG-NEXT:    MEM_RAT MSKOR T1.XW, T0.X
4757; EG-NEXT:    CF_END
4758; EG-NEXT:    PAD
4759; EG-NEXT:    ALU clause starting at 6:
4760; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4761; EG-NEXT:    8(1.121039e-44), 0(0.000000e+00)
4762; EG-NEXT:     AND_INT * T1.W, PV.W, literal.x,
4763; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4764; EG-NEXT:     LSHL T1.W, PV.W, literal.x,
4765; EG-NEXT:     AND_INT * T2.W, KC0[3].Y, 1,
4766; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4767; EG-NEXT:     LSHL T1.X, PS, PV.W,
4768; EG-NEXT:     LSHL * T1.W, literal.x, PV.W,
4769; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4770; EG-NEXT:     MOV T1.Y, 0.0,
4771; EG-NEXT:     MOV * T1.Z, 0.0,
4772; EG-NEXT:     LSHR T0.X, T0.W, literal.x,
4773; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.y,
4774; EG-NEXT:    2(2.802597e-45), 4(5.605194e-45)
4775; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
4776; EG-NEXT:     MOV * T3.X, KC0[3].X,
4777; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4778; EG-NEXT:     LSHR T4.X, KC0[2].Y, literal.x,
4779; EG-NEXT:     MOV * T5.X, KC0[2].W,
4780; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4781;
4782; CM-LABEL: i65_arg:
4783; CM:       ; %bb.0: ; %entry
4784; CM-NEXT:    ALU 21, @6, KC0[CB0:0-32], KC1[]
4785; CM-NEXT:    MEM_RAT MSKOR T1.XW, T5.X
4786; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
4787; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
4788; CM-NEXT:    CF_END
4789; CM-NEXT:    PAD
4790; CM-NEXT:    ALU clause starting at 6:
4791; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
4792; CM-NEXT:    8(1.121039e-44), 0(0.000000e+00)
4793; CM-NEXT:     AND_INT * T1.W, PV.W, literal.x,
4794; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4795; CM-NEXT:     LSHL T0.Z, PV.W, literal.x,
4796; CM-NEXT:     AND_INT * T1.W, KC0[3].Y, 1,
4797; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4798; CM-NEXT:     LSHL T1.X, PV.W, PV.Z,
4799; CM-NEXT:     LSHL * T1.W, literal.x, PV.Z,
4800; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4801; CM-NEXT:     MOV T1.Y, 0.0,
4802; CM-NEXT:     MOV * T1.Z, 0.0,
4803; CM-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
4804; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4805; CM-NEXT:     MOV T2.X, KC0[2].W,
4806; CM-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
4807; CM-NEXT:    4(5.605194e-45), 0(0.000000e+00)
4808; CM-NEXT:     LSHR * T3.X, PV.W, literal.x,
4809; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4810; CM-NEXT:     MOV * T4.X, KC0[3].X,
4811; CM-NEXT:     LSHR * T5.X, T0.W, literal.x,
4812; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4813entry:
4814  store i65 %in, i65 addrspace(1)* %out, align 4
4815  ret void
4816}
4817
; i1_arg: kernel taking a pointer followed by an i1 by value; the body stores
; %x straight to the i1 pointer %out (align 1). In the amdgcn checks below
; the argument is loaded as a whole dword, masked with 1, and written with a
; byte store; the r600 checks read it with VTX_READ_8 and store through
; MEM_RAT MSKOR.
; The check lines are autogenerated; regenerate rather than hand-edit.
4818define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
4819; SI-LABEL: i1_arg:
4820; SI:       ; %bb.0:
4821; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
4822; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4823; SI-NEXT:    s_mov_b32 s3, 0xf000
4824; SI-NEXT:    s_waitcnt lgkmcnt(0)
4825; SI-NEXT:    s_and_b32 s4, s2, 1
4826; SI-NEXT:    s_mov_b32 s2, -1
4827; SI-NEXT:    v_mov_b32_e32 v0, s4
4828; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
4829; SI-NEXT:    s_endpgm
4830;
4831; VI-LABEL: i1_arg:
4832; VI:       ; %bb.0:
4833; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4834; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
4835; VI-NEXT:    s_waitcnt lgkmcnt(0)
4836; VI-NEXT:    v_mov_b32_e32 v0, s2
4837; VI-NEXT:    s_and_b32 s0, s0, 1
4838; VI-NEXT:    v_mov_b32_e32 v1, s3
4839; VI-NEXT:    v_mov_b32_e32 v2, s0
4840; VI-NEXT:    flat_store_byte v[0:1], v2
4841; VI-NEXT:    s_endpgm
4842;
4843; GFX9-LABEL: i1_arg:
4844; GFX9:       ; %bb.0:
4845; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4846; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
4847; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4848; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4849; GFX9-NEXT:    s_and_b32 s2, s2, 1
4850; GFX9-NEXT:    v_mov_b32_e32 v1, s2
4851; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
4852; GFX9-NEXT:    s_endpgm
4853;
4854; EG-LABEL: i1_arg:
4855; EG:       ; %bb.0:
4856; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
4857; EG-NEXT:    TEX 0 @6
4858; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
4859; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
4860; EG-NEXT:    CF_END
4861; EG-NEXT:    PAD
4862; EG-NEXT:    Fetch clause starting at 6:
4863; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4864; EG-NEXT:    ALU clause starting at 8:
4865; EG-NEXT:     MOV * T0.X, 0.0,
4866; EG-NEXT:    ALU clause starting at 9:
4867; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
4868; EG-NEXT:     AND_INT * T1.W, T0.X, 1,
4869; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4870; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
4871; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4872; EG-NEXT:     LSHL T0.X, T1.W, PV.W,
4873; EG-NEXT:     LSHL * T0.W, literal.x, PV.W,
4874; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4875; EG-NEXT:     MOV T0.Y, 0.0,
4876; EG-NEXT:     MOV * T0.Z, 0.0,
4877; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4878; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4879;
4880; CM-LABEL: i1_arg:
4881; CM:       ; %bb.0:
4882; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
4883; CM-NEXT:    TEX 0 @6
4884; CM-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
4885; CM-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
4886; CM-NEXT:    CF_END
4887; CM-NEXT:    PAD
4888; CM-NEXT:    Fetch clause starting at 6:
4889; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4890; CM-NEXT:    ALU clause starting at 8:
4891; CM-NEXT:     MOV * T0.X, 0.0,
4892; CM-NEXT:    ALU clause starting at 9:
4893; CM-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
4894; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4895; CM-NEXT:     AND_INT T0.Z, T0.X, 1,
4896; CM-NEXT:     LSHL * T0.W, PV.W, literal.x,
4897; CM-NEXT:    3(4.203895e-45), 0(0.000000e+00)
4898; CM-NEXT:     LSHL T0.X, PV.Z, PV.W,
4899; CM-NEXT:     LSHL * T0.W, literal.x, PV.W,
4900; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
4901; CM-NEXT:     MOV T0.Y, 0.0,
4902; CM-NEXT:     MOV * T0.Z, 0.0,
4903; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4904; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4905  store i1 %x, i1 addrspace(1)* %out, align 1
4906  ret void
4907}
4908
; i1_arg_zext_i32: kernel taking a pointer followed by an i1 by value; the
; body zero-extends %x to i32 and stores the result to %out (align 4). In the
; amdgcn checks below the zero extension appears as an s_and_b32 with 1 on
; the loaded dword, followed by a dword store.
; The check lines are autogenerated; regenerate rather than hand-edit.
4909define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
4910; SI-LABEL: i1_arg_zext_i32:
4911; SI:       ; %bb.0:
4912; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
4913; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4914; SI-NEXT:    s_mov_b32 s3, 0xf000
4915; SI-NEXT:    s_waitcnt lgkmcnt(0)
4916; SI-NEXT:    s_and_b32 s4, s2, 1
4917; SI-NEXT:    s_mov_b32 s2, -1
4918; SI-NEXT:    v_mov_b32_e32 v0, s4
4919; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
4920; SI-NEXT:    s_endpgm
4921;
4922; VI-LABEL: i1_arg_zext_i32:
4923; VI:       ; %bb.0:
4924; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4925; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
4926; VI-NEXT:    s_waitcnt lgkmcnt(0)
4927; VI-NEXT:    v_mov_b32_e32 v0, s2
4928; VI-NEXT:    s_and_b32 s0, s0, 1
4929; VI-NEXT:    v_mov_b32_e32 v1, s3
4930; VI-NEXT:    v_mov_b32_e32 v2, s0
4931; VI-NEXT:    flat_store_dword v[0:1], v2
4932; VI-NEXT:    s_endpgm
4933;
4934; GFX9-LABEL: i1_arg_zext_i32:
4935; GFX9:       ; %bb.0:
4936; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
4937; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
4938; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4939; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
4940; GFX9-NEXT:    s_and_b32 s2, s2, 1
4941; GFX9-NEXT:    v_mov_b32_e32 v1, s2
4942; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
4943; GFX9-NEXT:    s_endpgm
4944;
4945; EG-LABEL: i1_arg_zext_i32:
4946; EG:       ; %bb.0:
4947; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
4948; EG-NEXT:    TEX 0 @6
4949; EG-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
4950; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
4951; EG-NEXT:    CF_END
4952; EG-NEXT:    PAD
4953; EG-NEXT:    Fetch clause starting at 6:
4954; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4955; EG-NEXT:    ALU clause starting at 8:
4956; EG-NEXT:     MOV * T0.X, 0.0,
4957; EG-NEXT:    ALU clause starting at 9:
4958; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4959; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4960;
4961; CM-LABEL: i1_arg_zext_i32:
4962; CM:       ; %bb.0:
4963; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
4964; CM-NEXT:    TEX 0 @6
4965; CM-NEXT:    ALU 1, @9, KC0[CB0:0-32], KC1[]
4966; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
4967; CM-NEXT:    CF_END
4968; CM-NEXT:    PAD
4969; CM-NEXT:    Fetch clause starting at 6:
4970; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
4971; CM-NEXT:    ALU clause starting at 8:
4972; CM-NEXT:     MOV * T0.X, 0.0,
4973; CM-NEXT:    ALU clause starting at 9:
4974; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
4975; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
4976  %ext = zext i1 %x to i32
4977  store i32 %ext, i32 addrspace(1)* %out, align 4
4978  ret void
4979}
4980
; i1_arg_zext_i64: kernel taking a pointer followed by an i1 by value; the
; body zero-extends %x to i64 and stores the result to %out (align 8). In the
; amdgcn checks below the low dword is the loaded argument masked with 1 and
; the high dword is a constant 0 moved into a VGPR before the dwordx2 store.
; The check lines are autogenerated; regenerate rather than hand-edit.
4981define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
4982; SI-LABEL: i1_arg_zext_i64:
4983; SI:       ; %bb.0:
4984; SI-NEXT:    s_load_dword s4, s[0:1], 0xb
4985; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
4986; SI-NEXT:    s_mov_b32 s3, 0xf000
4987; SI-NEXT:    s_mov_b32 s2, -1
4988; SI-NEXT:    s_waitcnt lgkmcnt(0)
4989; SI-NEXT:    s_and_b32 s4, s4, 1
4990; SI-NEXT:    v_mov_b32_e32 v1, 0
4991; SI-NEXT:    v_mov_b32_e32 v0, s4
4992; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4993; SI-NEXT:    s_endpgm
4994;
4995; VI-LABEL: i1_arg_zext_i64:
4996; VI:       ; %bb.0:
4997; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
4998; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
4999; VI-NEXT:    v_mov_b32_e32 v1, 0
5000; VI-NEXT:    s_waitcnt lgkmcnt(0)
5001; VI-NEXT:    v_mov_b32_e32 v2, s2
5002; VI-NEXT:    s_and_b32 s0, s0, 1
5003; VI-NEXT:    v_mov_b32_e32 v0, s0
5004; VI-NEXT:    v_mov_b32_e32 v3, s3
5005; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
5006; VI-NEXT:    s_endpgm
5007;
5008; GFX9-LABEL: i1_arg_zext_i64:
5009; GFX9:       ; %bb.0:
5010; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5011; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
5012; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5013; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5014; GFX9-NEXT:    s_and_b32 s2, s2, 1
5015; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5016; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
5017; GFX9-NEXT:    s_endpgm
5018;
5019; EG-LABEL: i1_arg_zext_i64:
5020; EG:       ; %bb.0:
5021; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
5022; EG-NEXT:    TEX 0 @6
5023; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5024; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5025; EG-NEXT:    CF_END
5026; EG-NEXT:    PAD
5027; EG-NEXT:    Fetch clause starting at 6:
5028; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5029; EG-NEXT:    ALU clause starting at 8:
5030; EG-NEXT:     MOV * T0.X, 0.0,
5031; EG-NEXT:    ALU clause starting at 9:
5032; EG-NEXT:     MOV * T0.Y, 0.0,
5033; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5034; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5035;
5036; CM-LABEL: i1_arg_zext_i64:
5037; CM:       ; %bb.0:
5038; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
5039; CM-NEXT:    TEX 0 @6
5040; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5041; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5042; CM-NEXT:    CF_END
5043; CM-NEXT:    PAD
5044; CM-NEXT:    Fetch clause starting at 6:
5045; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5046; CM-NEXT:    ALU clause starting at 8:
5047; CM-NEXT:     MOV * T0.X, 0.0,
5048; CM-NEXT:    ALU clause starting at 9:
5049; CM-NEXT:     MOV * T0.Y, 0.0,
5050; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5051; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5052  %ext = zext i1 %x to i64
5053  store i64 %ext, i64 addrspace(1)* %out, align 8
5054  ret void
5055}
5056
; i1_arg_sext_i32: kernel taking a pointer followed by an i1 by value; the
; body sign-extends %x to i32 and stores the result to %out (align 4). In the
; checks below the sign extension appears as a 1-bit signed bitfield extract
; (s_bfe_i32 with 0x10000 on amdgcn, BFE_INT on r600).
; The check lines are autogenerated; regenerate rather than hand-edit.
5057define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
5058; SI-LABEL: i1_arg_sext_i32:
5059; SI:       ; %bb.0:
5060; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
5061; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5062; SI-NEXT:    s_mov_b32 s3, 0xf000
5063; SI-NEXT:    s_waitcnt lgkmcnt(0)
5064; SI-NEXT:    s_bfe_i32 s4, s2, 0x10000
5065; SI-NEXT:    s_mov_b32 s2, -1
5066; SI-NEXT:    v_mov_b32_e32 v0, s4
5067; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5068; SI-NEXT:    s_endpgm
5069;
5070; VI-LABEL: i1_arg_sext_i32:
5071; VI:       ; %bb.0:
5072; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5073; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
5074; VI-NEXT:    s_waitcnt lgkmcnt(0)
5075; VI-NEXT:    v_mov_b32_e32 v0, s2
5076; VI-NEXT:    s_bfe_i32 s0, s0, 0x10000
5077; VI-NEXT:    v_mov_b32_e32 v1, s3
5078; VI-NEXT:    v_mov_b32_e32 v2, s0
5079; VI-NEXT:    flat_store_dword v[0:1], v2
5080; VI-NEXT:    s_endpgm
5081;
5082; GFX9-LABEL: i1_arg_sext_i32:
5083; GFX9:       ; %bb.0:
5084; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5085; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
5086; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5087; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5088; GFX9-NEXT:    s_bfe_i32 s2, s2, 0x10000
5089; GFX9-NEXT:    v_mov_b32_e32 v1, s2
5090; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
5091; GFX9-NEXT:    s_endpgm
5092;
5093; EG-LABEL: i1_arg_sext_i32:
5094; EG:       ; %bb.0:
5095; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
5096; EG-NEXT:    TEX 0 @6
5097; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5098; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
5099; EG-NEXT:    CF_END
5100; EG-NEXT:    PAD
5101; EG-NEXT:    Fetch clause starting at 6:
5102; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5103; EG-NEXT:    ALU clause starting at 8:
5104; EG-NEXT:     MOV * T0.X, 0.0,
5105; EG-NEXT:    ALU clause starting at 9:
5106; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, 1,
5107; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5108; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5109;
5110; CM-LABEL: i1_arg_sext_i32:
5111; CM:       ; %bb.0:
5112; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
5113; CM-NEXT:    TEX 0 @6
5114; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
5115; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
5116; CM-NEXT:    CF_END
5117; CM-NEXT:    PAD
5118; CM-NEXT:    Fetch clause starting at 6:
5119; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5120; CM-NEXT:    ALU clause starting at 8:
5121; CM-NEXT:     MOV * T0.X, 0.0,
5122; CM-NEXT:    ALU clause starting at 9:
5123; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, 1,
5124; CM-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5125; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5126  %ext = sext i1 %x to i32
5127  store i32 %ext, i32 addrspace(1)* %out, align 4
5128  ret void
5129}
5130
; i1_arg_sext_i64: kernel taking a pointer followed by an i1 by value; the
; body sign-extends %x to i64 and stores the result to %out (align 8). In the
; amdgcn checks below the sign extension appears as s_bfe_i64 with 0x10000;
; in the r600 checks it is a BFE_INT whose result is also copied into T0.Y.
; The check lines are autogenerated; regenerate rather than hand-edit.
5131define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
5132; SI-LABEL: i1_arg_sext_i64:
5133; SI:       ; %bb.0:
5134; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
5135; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
5136; SI-NEXT:    s_mov_b32 s3, 0xf000
5137; SI-NEXT:    s_waitcnt lgkmcnt(0)
5138; SI-NEXT:    s_bfe_i64 s[4:5], s[2:3], 0x10000
5139; SI-NEXT:    s_mov_b32 s2, -1
5140; SI-NEXT:    v_mov_b32_e32 v0, s4
5141; SI-NEXT:    v_mov_b32_e32 v1, s5
5142; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5143; SI-NEXT:    s_endpgm
5144;
5145; VI-LABEL: i1_arg_sext_i64:
5146; VI:       ; %bb.0:
5147; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
5148; VI-NEXT:    s_load_dword s0, s[0:1], 0x2c
5149; VI-NEXT:    s_waitcnt lgkmcnt(0)
5150; VI-NEXT:    v_mov_b32_e32 v0, s2
5151; VI-NEXT:    s_bfe_i64 s[0:1], s[0:1], 0x10000
5152; VI-NEXT:    v_mov_b32_e32 v3, s1
5153; VI-NEXT:    v_mov_b32_e32 v1, s3
5154; VI-NEXT:    v_mov_b32_e32 v2, s0
5155; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5156; VI-NEXT:    s_endpgm
5157;
5158; GFX9-LABEL: i1_arg_sext_i64:
5159; GFX9:       ; %bb.0:
5160; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
5161; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
5162; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5163; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5164; GFX9-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x10000
5165; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5166; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5167; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
5168; GFX9-NEXT:    s_endpgm
5169;
5170; EG-LABEL: i1_arg_sext_i64:
5171; EG:       ; %bb.0:
5172; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
5173; EG-NEXT:    TEX 0 @6
5174; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
5175; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
5176; EG-NEXT:    CF_END
5177; EG-NEXT:    PAD
5178; EG-NEXT:    Fetch clause starting at 6:
5179; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5180; EG-NEXT:    ALU clause starting at 8:
5181; EG-NEXT:     MOV * T0.X, 0.0,
5182; EG-NEXT:    ALU clause starting at 9:
5183; EG-NEXT:     BFE_INT T0.X, T0.X, 0.0, 1,
5184; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
5185; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5186; EG-NEXT:     MOV * T0.Y, PV.X,
5187;
5188; CM-LABEL: i1_arg_sext_i64:
5189; CM:       ; %bb.0:
5190; CM-NEXT:    ALU 0, @8, KC0[], KC1[]
5191; CM-NEXT:    TEX 0 @6
5192; CM-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
5193; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
5194; CM-NEXT:    CF_END
5195; CM-NEXT:    PAD
5196; CM-NEXT:    Fetch clause starting at 6:
5197; CM-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
5198; CM-NEXT:    ALU clause starting at 8:
5199; CM-NEXT:     MOV * T0.X, 0.0,
5200; CM-NEXT:    ALU clause starting at 9:
5201; CM-NEXT:     BFE_INT * T0.X, T0.X, 0.0, 1,
5202; CM-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
5203; CM-NEXT:     MOV * T0.Y, PV.X,
5204; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
5205  %ext = sext i1 %x to i64
5206  store i64 %ext, i64 addrspace(1)* %out, align 8
5207  ret void
5208}
5209
; empty_struct_arg: kernel whose only argument is an empty struct; the body
; is just `ret void`. The checks below expect no argument loads at all: the
; amdgcn output is a lone s_endpgm and the r600 output is CF_END plus PAD
; (both r600 run lines share one check prefix here).
; The check lines are autogenerated; regenerate rather than hand-edit.
5210define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
5211; SI-LABEL: empty_struct_arg:
5212; SI:       ; %bb.0:
5213; SI-NEXT:    s_endpgm
5214;
5215; VI-LABEL: empty_struct_arg:
5216; VI:       ; %bb.0:
5217; VI-NEXT:    s_endpgm
5218;
5219; GFX9-LABEL: empty_struct_arg:
5220; GFX9:       ; %bb.0:
5221; GFX9-NEXT:    s_endpgm
5222;
5223; EGCM-LABEL: empty_struct_arg:
5224; EGCM:       ; %bb.0:
5225; EGCM-NEXT:    CF_END
5226; EGCM-NEXT:    PAD
5227  ret void
5228}
5229
5230; The correct load offsets for these:
5231; load 4 from 0,
5232; load 8 from 8
5233; load 4 from 24
5234; load 8 from 32
5235
5236; With the SelectionDAG argument lowering, the alignments for the
5237; struct members are not properly considered, making these wrong.
5238
5239; FIXME: Total argument size is computed wrong
; struct_argument_alignment: kernel taking {i32, i64}, an unnamed i8, and a
; second {i32, i64}. The body extracts both fields of each struct and writes
; all four with volatile stores to a null addrspace(1) pointer -- presumably
; only so that every field load stays in the output (assumption; the file
; does not say). The checks below pin the kernarg offset used for each field.
; The check lines are autogenerated; regenerate rather than hand-edit.
5240define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
5241; SI-LABEL: struct_argument_alignment:
5242; SI:       ; %bb.0:
5243; SI-NEXT:    s_load_dword s8, s[0:1], 0x9
5244; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
5245; SI-NEXT:    s_load_dword s9, s[0:1], 0xf
5246; SI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x11
5247; SI-NEXT:    s_mov_b32 s0, 0
5248; SI-NEXT:    s_mov_b32 s3, 0xf000
5249; SI-NEXT:    s_mov_b32 s2, -1
5250; SI-NEXT:    s_mov_b32 s1, s0
5251; SI-NEXT:    s_waitcnt lgkmcnt(0)
5252; SI-NEXT:    v_mov_b32_e32 v0, s8
5253; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5254; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5255; SI-NEXT:    v_mov_b32_e32 v0, s4
5256; SI-NEXT:    v_mov_b32_e32 v1, s5
5257; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5258; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5259; SI-NEXT:    v_mov_b32_e32 v0, s9
5260; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5261; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5262; SI-NEXT:    v_mov_b32_e32 v0, s6
5263; SI-NEXT:    v_mov_b32_e32 v1, s7
5264; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5265; SI-NEXT:    s_waitcnt vmcnt(0)
5266; SI-NEXT:    s_endpgm
5267;
5268; VI-LABEL: struct_argument_alignment:
5269; VI:       ; %bb.0:
5270; VI-NEXT:    s_load_dword s4, s[0:1], 0x24
5271; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
5272; VI-NEXT:    s_load_dword s5, s[0:1], 0x3c
5273; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x44
5274; VI-NEXT:    v_mov_b32_e32 v0, 0
5275; VI-NEXT:    v_mov_b32_e32 v1, 0
5276; VI-NEXT:    s_waitcnt lgkmcnt(0)
5277; VI-NEXT:    v_mov_b32_e32 v2, s4
5278; VI-NEXT:    flat_store_dword v[0:1], v2
5279; VI-NEXT:    s_waitcnt vmcnt(0)
5280; VI-NEXT:    v_mov_b32_e32 v2, s2
5281; VI-NEXT:    v_mov_b32_e32 v3, s3
5282; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5283; VI-NEXT:    s_waitcnt vmcnt(0)
5284; VI-NEXT:    v_mov_b32_e32 v2, s5
5285; VI-NEXT:    flat_store_dword v[0:1], v2
5286; VI-NEXT:    s_waitcnt vmcnt(0)
5287; VI-NEXT:    v_mov_b32_e32 v3, s1
5288; VI-NEXT:    v_mov_b32_e32 v2, s0
5289; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
5290; VI-NEXT:    s_waitcnt vmcnt(0)
5291; VI-NEXT:    s_endpgm
5292;
5293; GFX9-LABEL: struct_argument_alignment:
5294; GFX9:       ; %bb.0:
5295; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
5296; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
5297; GFX9-NEXT:    s_load_dword s7, s[4:5], 0x18
5298; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x20
5299; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5300; GFX9-NEXT:    v_mov_b32_e32 v1, 0
5301; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5302; GFX9-NEXT:    v_mov_b32_e32 v2, s6
5303; GFX9-NEXT:    global_store_dword v[0:1], v2, off
5304; GFX9-NEXT:    s_waitcnt vmcnt(0)
5305; GFX9-NEXT:    v_mov_b32_e32 v3, s1
5306; GFX9-NEXT:    v_mov_b32_e32 v2, s0
5307; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
5308; GFX9-NEXT:    s_waitcnt vmcnt(0)
5309; GFX9-NEXT:    v_mov_b32_e32 v2, s7
5310; GFX9-NEXT:    global_store_dword v[0:1], v2, off
5311; GFX9-NEXT:    s_waitcnt vmcnt(0)
5312; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5313; GFX9-NEXT:    v_mov_b32_e32 v3, s3
5314; GFX9-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
5315; GFX9-NEXT:    s_waitcnt vmcnt(0)
5316; GFX9-NEXT:    s_endpgm
5317;
5318; EG-LABEL: struct_argument_alignment:
5319; EG:       ; %bb.0:
5320; EG-NEXT:    ALU 9, @8, KC0[CB0:0-32], KC1[]
5321; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T7.X, T6.X, 0
5322; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0
5323; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T6.X, 0
5324; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T6.X, 0
5325; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 0
5326; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
5327; EG-NEXT:    CF_END
5328; EG-NEXT:    ALU clause starting at 8:
5329; EG-NEXT:     MOV T0.X, KC0[4].Y,
5330; EG-NEXT:     MOV * T1.X, KC0[4].Z,
5331; EG-NEXT:     MOV T2.X, KC0[3].W,
5332; EG-NEXT:     MOV * T3.X, KC0[2].W,
5333; EG-NEXT:     MOV T4.X, literal.x,
5334; EG-NEXT:     MOV * T5.X, KC0[3].X,
5335; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5336; EG-NEXT:     MOV T6.X, literal.x,
5337; EG-NEXT:     MOV * T7.X, KC0[2].Y,
5338; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5339;
5340; CM-LABEL: struct_argument_alignment:
5341; CM:       ; %bb.0:
5342; CM-NEXT:    ALU 9, @8, KC0[CB0:0-32], KC1[]
5343; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T7.X, T6.X
5344; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T5.X, T4.X
5345; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T6.X
5346; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T6.X
5347; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5348; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T6.X
5349; CM-NEXT:    CF_END
5350; CM-NEXT:    ALU clause starting at 8:
5351; CM-NEXT:     MOV * T0.X, KC0[4].Y,
5352; CM-NEXT:     MOV * T1.X, KC0[4].Z,
5353; CM-NEXT:     MOV * T2.X, KC0[3].W,
5354; CM-NEXT:     MOV * T3.X, KC0[2].W,
5355; CM-NEXT:     MOV * T4.X, literal.x,
5356; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5357; CM-NEXT:     MOV * T5.X, KC0[3].X,
5358; CM-NEXT:     MOV * T6.X, literal.x,
5359; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5360; CM-NEXT:     MOV * T7.X, KC0[2].Y,
5361  %val0 = extractvalue {i32, i64} %arg0, 0
5362  %val1 = extractvalue {i32, i64} %arg0, 1
5363  %val2 = extractvalue {i32, i64} %arg1, 0
5364  %val3 = extractvalue {i32, i64} %arg1, 1
5365  store volatile i32 %val0, i32 addrspace(1)* null
5366  store volatile i64 %val1, i64 addrspace(1)* null
5367  store volatile i32 %val2, i32 addrspace(1)* null
5368  store volatile i64 %val3, i64 addrspace(1)* null
5369  ret void
5370}
5371
5372; No padding between i8 and next struct, but round up at end to 4 byte
5373; multiple.
5374define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
5375; SI-LABEL: packed_struct_argument_alignment:
5376; SI:       ; %bb.0:
5377; SI-NEXT:    s_mov_b32 s3, 0xf000
5378; SI-NEXT:    s_mov_b32 s2, -1
5379; SI-NEXT:    s_load_dword s6, s[0:1], 0x9
5380; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xa
5381; SI-NEXT:    buffer_load_ubyte v4, off, s[0:3], 0 offset:49
5382; SI-NEXT:    buffer_load_ubyte v5, off, s[0:3], 0 offset:50
5383; SI-NEXT:    buffer_load_ubyte v6, off, s[0:3], 0 offset:51
5384; SI-NEXT:    buffer_load_ubyte v7, off, s[0:3], 0 offset:52
5385; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:53
5386; SI-NEXT:    s_mov_b32 s0, 0
5387; SI-NEXT:    s_mov_b32 s1, s0
5388; SI-NEXT:    s_waitcnt lgkmcnt(0)
5389; SI-NEXT:    v_mov_b32_e32 v2, s6
5390; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
5391; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5392; SI-NEXT:    v_mov_b32_e32 v2, s4
5393; SI-NEXT:    v_mov_b32_e32 v3, s5
5394; SI-NEXT:    buffer_store_dwordx2 v[2:3], off, s[0:3], 0
5395; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5396; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
5397; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v7
5398; SI-NEXT:    v_or_b32_e32 v2, v2, v4
5399; SI-NEXT:    v_or_b32_e32 v3, v3, v6
5400; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
5401; SI-NEXT:    v_or_b32_e32 v2, v3, v2
5402; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
5403; SI-NEXT:    s_waitcnt vmcnt(0)
5404; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5405; SI-NEXT:    s_waitcnt vmcnt(0)
5406; SI-NEXT:    s_endpgm
5407;
5408; VI-LABEL: packed_struct_argument_alignment:
5409; VI:       ; %bb.0:
5410; VI-NEXT:    s_add_u32 s2, s0, 49
5411; VI-NEXT:    s_addc_u32 s3, s1, 0
5412; VI-NEXT:    s_add_u32 s4, s0, 50
5413; VI-NEXT:    s_addc_u32 s5, s1, 0
5414; VI-NEXT:    v_mov_b32_e32 v2, s2
5415; VI-NEXT:    v_mov_b32_e32 v3, s3
5416; VI-NEXT:    s_add_u32 s2, s2, 3
5417; VI-NEXT:    s_addc_u32 s3, s3, 0
5418; VI-NEXT:    v_mov_b32_e32 v5, s3
5419; VI-NEXT:    v_mov_b32_e32 v4, s2
5420; VI-NEXT:    s_add_u32 s2, s0, 51
5421; VI-NEXT:    s_addc_u32 s3, s1, 0
5422; VI-NEXT:    v_mov_b32_e32 v0, s4
5423; VI-NEXT:    v_mov_b32_e32 v7, s3
5424; VI-NEXT:    v_mov_b32_e32 v1, s5
5425; VI-NEXT:    v_mov_b32_e32 v6, s2
5426; VI-NEXT:    s_load_dword s4, s[0:1], 0x24
5427; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x28
5428; VI-NEXT:    flat_load_ubyte v8, v[0:1]
5429; VI-NEXT:    flat_load_ubyte v9, v[2:3]
5430; VI-NEXT:    flat_load_ubyte v10, v[4:5]
5431; VI-NEXT:    flat_load_ubyte v6, v[6:7]
5432; VI-NEXT:    s_add_u32 s0, s0, 53
5433; VI-NEXT:    s_addc_u32 s1, s1, 0
5434; VI-NEXT:    v_mov_b32_e32 v0, s0
5435; VI-NEXT:    v_mov_b32_e32 v1, s1
5436; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
5437; VI-NEXT:    v_mov_b32_e32 v2, 0
5438; VI-NEXT:    s_waitcnt lgkmcnt(0)
5439; VI-NEXT:    v_mov_b32_e32 v5, s3
5440; VI-NEXT:    v_mov_b32_e32 v3, 0
5441; VI-NEXT:    v_mov_b32_e32 v7, s4
5442; VI-NEXT:    v_mov_b32_e32 v4, s2
5443; VI-NEXT:    flat_store_dword v[2:3], v7
5444; VI-NEXT:    s_waitcnt vmcnt(0)
5445; VI-NEXT:    flat_store_dwordx2 v[2:3], v[4:5]
5446; VI-NEXT:    s_waitcnt vmcnt(0)
5447; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v8
5448; VI-NEXT:    v_or_b32_e32 v4, v4, v9
5449; VI-NEXT:    v_lshlrev_b32_e32 v5, 8, v10
5450; VI-NEXT:    v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5451; VI-NEXT:    v_or_b32_e32 v4, v5, v4
5452; VI-NEXT:    flat_store_dword v[2:3], v4
5453; VI-NEXT:    s_waitcnt vmcnt(0)
5454; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
5455; VI-NEXT:    s_waitcnt vmcnt(0)
5456; VI-NEXT:    s_endpgm
5457;
5458; GFX9-LABEL: packed_struct_argument_alignment:
5459; GFX9:       ; %bb.0:
5460; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5461; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x0
5462; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4
5463; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[4:5] offset:17
5464; GFX9-NEXT:    global_load_dword v6, v2, s[4:5] offset:13
5465; GFX9-NEXT:    v_mov_b32_e32 v2, 0
5466; GFX9-NEXT:    v_mov_b32_e32 v3, 0
5467; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5468; GFX9-NEXT:    v_mov_b32_e32 v7, s2
5469; GFX9-NEXT:    v_mov_b32_e32 v5, s1
5470; GFX9-NEXT:    v_mov_b32_e32 v4, s0
5471; GFX9-NEXT:    global_store_dword v[2:3], v7, off
5472; GFX9-NEXT:    s_waitcnt vmcnt(0)
5473; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[4:5], off
5474; GFX9-NEXT:    s_waitcnt vmcnt(0)
5475; GFX9-NEXT:    global_store_dword v[2:3], v6, off
5476; GFX9-NEXT:    s_waitcnt vmcnt(0)
5477; GFX9-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
5478; GFX9-NEXT:    s_waitcnt vmcnt(0)
5479; GFX9-NEXT:    s_endpgm
5480;
5481; EG-LABEL: packed_struct_argument_alignment:
5482; EG:       ; %bb.0:
5483; EG-NEXT:    ALU 6, @18, KC0[CB0:0-32], KC1[]
5484; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0
5485; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5486; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5487; EG-NEXT:    ALU 2, @25, KC0[], KC1[]
5488; EG-NEXT:    TEX 0 @12
5489; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
5490; EG-NEXT:    TEX 0 @14
5491; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T1.X, 0
5492; EG-NEXT:    TEX 0 @16
5493; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 1
5494; EG-NEXT:    CF_END
5495; EG-NEXT:    Fetch clause starting at 12:
5496; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 49, #3
5497; EG-NEXT:    Fetch clause starting at 14:
5498; EG-NEXT:     VTX_READ_32 T2.X, T2.X, 57, #3
5499; EG-NEXT:    Fetch clause starting at 16:
5500; EG-NEXT:     VTX_READ_32 T4.X, T4.X, 53, #3
5501; EG-NEXT:    ALU clause starting at 18:
5502; EG-NEXT:     MOV T0.X, KC0[2].Z,
5503; EG-NEXT:     MOV * T1.X, literal.x,
5504; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5505; EG-NEXT:     MOV T2.X, KC0[2].W,
5506; EG-NEXT:     MOV * T3.X, literal.x,
5507; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5508; EG-NEXT:     MOV * T4.X, KC0[2].Y,
5509; EG-NEXT:    ALU clause starting at 25:
5510; EG-NEXT:     MOV T0.X, 0.0,
5511; EG-NEXT:     MOV * T2.X, 0.0,
5512; EG-NEXT:     MOV * T4.X, 0.0,
5513;
5514; CM-LABEL: packed_struct_argument_alignment:
5515; CM:       ; %bb.0:
5516; CM-NEXT:    ALU 6, @18, KC0[CB0:0-32], KC1[]
5517; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5518; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5519; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5520; CM-NEXT:    ALU 2, @25, KC0[], KC1[]
5521; CM-NEXT:    TEX 0 @12
5522; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T3.X
5523; CM-NEXT:    TEX 0 @14
5524; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T1.X
5525; CM-NEXT:    TEX 0 @16
5526; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T3.X
5527; CM-NEXT:    CF_END
5528; CM-NEXT:    Fetch clause starting at 12:
5529; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 49, #3
5530; CM-NEXT:    Fetch clause starting at 14:
5531; CM-NEXT:     VTX_READ_32 T2.X, T2.X, 57, #3
5532; CM-NEXT:    Fetch clause starting at 16:
5533; CM-NEXT:     VTX_READ_32 T4.X, T4.X, 53, #3
5534; CM-NEXT:    ALU clause starting at 18:
5535; CM-NEXT:     MOV * T0.X, KC0[2].Z,
5536; CM-NEXT:     MOV * T1.X, literal.x,
5537; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5538; CM-NEXT:     MOV * T2.X, KC0[2].W,
5539; CM-NEXT:     MOV * T3.X, literal.x,
5540; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5541; CM-NEXT:     MOV * T4.X, KC0[2].Y,
5542; CM-NEXT:    ALU clause starting at 25:
5543; CM-NEXT:     MOV * T0.X, 0.0,
5544; CM-NEXT:     MOV * T2.X, 0.0,
5545; CM-NEXT:     MOV * T4.X, 0.0,
5546  %val0 = extractvalue <{i32, i64}> %arg0, 0
5547  %val1 = extractvalue <{i32, i64}> %arg0, 1
5548  %val2 = extractvalue <{i32, i64}> %arg1, 0
5549  %val3 = extractvalue <{i32, i64}> %arg1, 1
5550  store volatile i32 %val0, i32 addrspace(1)* null
5551  store volatile i64 %val1, i64 addrspace(1)* null
5552  store volatile i32 %val2, i32 addrspace(1)* null
5553  store volatile i64 %val3, i64 addrspace(1)* null
5554  ret void
5555}
5556
; Kernel-argument layout test: two unpacked {i32, i64} structs, each followed
; by an unnamed i8, and finally a <4 x i32> vector. The body extracts both
; fields of %arg0 and %arg2 and volatile-stores them, followed by %arg4, to a
; null addrspace(1) pointer.
; The expected amdhsa gfx900 output reads the kernarg segment at byte offsets
; 0x0, 0x8, 0x18, 0x20 and 0x30. This is consistent with each struct being
; placed at 8-byte alignment and the vector at 16-byte alignment after the
; interleaved i8 arguments (presumably; confirm against the kernarg lowering).
5557define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
5558; SI-LABEL: struct_argument_alignment_after:
5559; SI:       ; %bb.0:
5560; SI-NEXT:    s_load_dword s12, s[0:1], 0x9
5561; SI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xb
5562; SI-NEXT:    s_load_dword s13, s[0:1], 0xf
5563; SI-NEXT:    s_load_dwordx2 s[10:11], s[0:1], 0x11
5564; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x15
5565; SI-NEXT:    s_mov_b32 s4, 0
5566; SI-NEXT:    s_mov_b32 s7, 0xf000
5567; SI-NEXT:    s_mov_b32 s6, -1
5568; SI-NEXT:    s_mov_b32 s5, s4
5569; SI-NEXT:    s_waitcnt lgkmcnt(0)
5570; SI-NEXT:    v_mov_b32_e32 v0, s12
5571; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5572; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5573; SI-NEXT:    v_mov_b32_e32 v0, s8
5574; SI-NEXT:    v_mov_b32_e32 v1, s9
5575; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5576; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5577; SI-NEXT:    v_mov_b32_e32 v0, s13
5578; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
5579; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5580; SI-NEXT:    v_mov_b32_e32 v0, s10
5581; SI-NEXT:    v_mov_b32_e32 v1, s11
5582; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5583; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5584; SI-NEXT:    v_mov_b32_e32 v0, s0
5585; SI-NEXT:    v_mov_b32_e32 v1, s1
5586; SI-NEXT:    v_mov_b32_e32 v2, s2
5587; SI-NEXT:    v_mov_b32_e32 v3, s3
5588; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
5589; SI-NEXT:    s_waitcnt vmcnt(0)
5590; SI-NEXT:    s_endpgm
5591;
5592; VI-LABEL: struct_argument_alignment_after:
5593; VI:       ; %bb.0:
5594; VI-NEXT:    s_load_dword s8, s[0:1], 0x24
5595; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x2c
5596; VI-NEXT:    s_load_dword s9, s[0:1], 0x3c
5597; VI-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x44
5598; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x54
5599; VI-NEXT:    v_mov_b32_e32 v4, 0
5600; VI-NEXT:    v_mov_b32_e32 v5, 0
5601; VI-NEXT:    s_waitcnt lgkmcnt(0)
5602; VI-NEXT:    v_mov_b32_e32 v0, s8
5603; VI-NEXT:    flat_store_dword v[4:5], v0
5604; VI-NEXT:    s_waitcnt vmcnt(0)
5605; VI-NEXT:    v_mov_b32_e32 v0, s4
5606; VI-NEXT:    v_mov_b32_e32 v1, s5
5607; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
5608; VI-NEXT:    s_waitcnt vmcnt(0)
5609; VI-NEXT:    v_mov_b32_e32 v0, s9
5610; VI-NEXT:    flat_store_dword v[4:5], v0
5611; VI-NEXT:    s_waitcnt vmcnt(0)
5612; VI-NEXT:    v_mov_b32_e32 v0, s6
5613; VI-NEXT:    v_mov_b32_e32 v1, s7
5614; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
5615; VI-NEXT:    s_waitcnt vmcnt(0)
5616; VI-NEXT:    v_mov_b32_e32 v0, s0
5617; VI-NEXT:    v_mov_b32_e32 v1, s1
5618; VI-NEXT:    v_mov_b32_e32 v2, s2
5619; VI-NEXT:    v_mov_b32_e32 v3, s3
5620; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
5621; VI-NEXT:    s_waitcnt vmcnt(0)
5622; VI-NEXT:    s_endpgm
5623;
5624; GFX9-LABEL: struct_argument_alignment_after:
5625; GFX9:       ; %bb.0:
5626; GFX9-NEXT:    s_load_dword s10, s[4:5], 0x0
5627; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
5628; GFX9-NEXT:    s_load_dword s11, s[4:5], 0x18
5629; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x20
5630; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x30
5631; GFX9-NEXT:    v_mov_b32_e32 v4, 0
5632; GFX9-NEXT:    v_mov_b32_e32 v5, 0
5633; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5634; GFX9-NEXT:    v_mov_b32_e32 v0, s10
5635; GFX9-NEXT:    global_store_dword v[4:5], v0, off
5636; GFX9-NEXT:    s_waitcnt vmcnt(0)
5637; GFX9-NEXT:    v_mov_b32_e32 v0, s6
5638; GFX9-NEXT:    v_mov_b32_e32 v1, s7
5639; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
5640; GFX9-NEXT:    s_waitcnt vmcnt(0)
5641; GFX9-NEXT:    v_mov_b32_e32 v0, s11
5642; GFX9-NEXT:    global_store_dword v[4:5], v0, off
5643; GFX9-NEXT:    s_waitcnt vmcnt(0)
5644; GFX9-NEXT:    v_mov_b32_e32 v0, s8
5645; GFX9-NEXT:    v_mov_b32_e32 v1, s9
5646; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
5647; GFX9-NEXT:    s_waitcnt vmcnt(0)
5648; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5649; GFX9-NEXT:    v_mov_b32_e32 v1, s1
5650; GFX9-NEXT:    v_mov_b32_e32 v2, s2
5651; GFX9-NEXT:    v_mov_b32_e32 v3, s3
5652; GFX9-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
5653; GFX9-NEXT:    s_waitcnt vmcnt(0)
5654; GFX9-NEXT:    s_endpgm
5655;
5656; EG-LABEL: struct_argument_alignment_after:
5657; EG:       ; %bb.0:
5658; EG-NEXT:    ALU 13, @10, KC0[CB0:0-32], KC1[]
5659; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.X, T7.X, 0
5660; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T5.X, 0
5661; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
5662; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T7.X, 0
5663; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T5.X, 0
5664; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T7.X, 0
5665; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T7.X, 1
5666; EG-NEXT:    CF_END
5667; EG-NEXT:    PAD
5668; EG-NEXT:    ALU clause starting at 10:
5669; EG-NEXT:     MOV * T0.W, KC0[6].X,
5670; EG-NEXT:     MOV * T0.Z, KC0[5].W,
5671; EG-NEXT:     MOV * T0.Y, KC0[5].Z,
5672; EG-NEXT:     MOV T0.X, KC0[5].Y,
5673; EG-NEXT:     MOV * T1.X, KC0[4].Y,
5674; EG-NEXT:     MOV T2.X, KC0[4].Z,
5675; EG-NEXT:     MOV * T3.X, KC0[3].W,
5676; EG-NEXT:     MOV T4.X, KC0[2].W,
5677; EG-NEXT:     MOV * T5.X, literal.x,
5678; EG-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5679; EG-NEXT:     MOV T6.X, KC0[3].X,
5680; EG-NEXT:     MOV * T7.X, literal.x,
5681; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5682; EG-NEXT:     MOV * T8.X, KC0[2].Y,
5683;
5684; CM-LABEL: struct_argument_alignment_after:
5685; CM:       ; %bb.0:
5686; CM-NEXT:    ALU 13, @10, KC0[CB0:0-32], KC1[]
5687; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T8.X, T7.X
5688; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T6.X, T5.X
5689; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T4.X, T7.X
5690; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T7.X
5691; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T5.X
5692; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T7.X
5693; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T7.X
5694; CM-NEXT:    CF_END
5695; CM-NEXT:    PAD
5696; CM-NEXT:    ALU clause starting at 10:
5697; CM-NEXT:     MOV * T0.W, KC0[6].X,
5698; CM-NEXT:     MOV * T0.Z, KC0[5].W,
5699; CM-NEXT:     MOV * T0.Y, KC0[5].Z,
5700; CM-NEXT:     MOV * T0.X, KC0[5].Y,
5701; CM-NEXT:     MOV * T1.X, KC0[4].Y,
5702; CM-NEXT:     MOV * T2.X, KC0[4].Z,
5703; CM-NEXT:     MOV * T3.X, KC0[3].W,
5704; CM-NEXT:     MOV * T4.X, KC0[2].W,
5705; CM-NEXT:     MOV * T5.X, literal.x,
5706; CM-NEXT:    1(1.401298e-45), 0(0.000000e+00)
5707; CM-NEXT:     MOV * T6.X, KC0[3].X,
5708; CM-NEXT:     MOV * T7.X, literal.x,
5709; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5710; CM-NEXT:     MOV * T8.X, KC0[2].Y,
; Unpack both fields of each struct argument, then store every value
; (volatile) to the null global pointer; the vector argument is stored last.
5711  %val0 = extractvalue {i32, i64} %arg0, 0
5712  %val1 = extractvalue {i32, i64} %arg0, 1
5713  %val2 = extractvalue {i32, i64} %arg2, 0
5714  %val3 = extractvalue {i32, i64} %arg2, 1
5715  store volatile i32 %val0, i32 addrspace(1)* null
5716  store volatile i64 %val1, i64 addrspace(1)* null
5717  store volatile i32 %val2, i32 addrspace(1)* null
5718  store volatile i64 %val3, i64 addrspace(1)* null
5719  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
5720  ret void
5721}
5722
; Kernel-argument layout test for an i16 followed by a [3 x i32] array. Both
; arguments are volatile-stored to undef addrspace(1) pointers.
; In the expected amdhsa gfx900 output all four values come from scalar dword
; loads at kernarg byte offsets 0x0, 0x4, 0x8 and 0xc, i.e. the array elements
; occupy offsets 4, 8 and 12, right after the dword that holds the i16.
5723define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
5724; SI-LABEL: array_3xi32:
5725; SI:       ; %bb.0:
5726; SI-NEXT:    s_load_dword s4, s[0:1], 0xc
5727; SI-NEXT:    s_load_dword s5, s[0:1], 0x9
5728; SI-NEXT:    s_load_dword s6, s[0:1], 0xa
5729; SI-NEXT:    s_load_dword s0, s[0:1], 0xb
5730; SI-NEXT:    s_mov_b32 s3, 0xf000
5731; SI-NEXT:    s_mov_b32 s2, -1
5732; SI-NEXT:    s_waitcnt lgkmcnt(0)
5733; SI-NEXT:    v_mov_b32_e32 v0, s5
5734; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
5735; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5736; SI-NEXT:    v_mov_b32_e32 v0, s4
5737; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5738; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5739; SI-NEXT:    v_mov_b32_e32 v0, s0
5740; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5741; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
5742; SI-NEXT:    v_mov_b32_e32 v0, s6
5743; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
5744; SI-NEXT:    s_waitcnt vmcnt(0)
5745; SI-NEXT:    s_endpgm
5746;
5747; VI-LABEL: array_3xi32:
5748; VI:       ; %bb.0:
5749; VI-NEXT:    s_load_dword s2, s[0:1], 0x24
5750; VI-NEXT:    s_load_dword s3, s[0:1], 0x28
5751; VI-NEXT:    s_load_dword s4, s[0:1], 0x2c
5752; VI-NEXT:    s_load_dword s0, s[0:1], 0x30
5753; VI-NEXT:    s_waitcnt lgkmcnt(0)
5754; VI-NEXT:    v_mov_b32_e32 v0, s2
5755; VI-NEXT:    v_mov_b32_e32 v1, s0
5756; VI-NEXT:    flat_store_short v[0:1], v0
5757; VI-NEXT:    s_waitcnt vmcnt(0)
5758; VI-NEXT:    flat_store_dword v[0:1], v1
5759; VI-NEXT:    s_waitcnt vmcnt(0)
5760; VI-NEXT:    v_mov_b32_e32 v0, s4
5761; VI-NEXT:    flat_store_dword v[0:1], v0
5762; VI-NEXT:    s_waitcnt vmcnt(0)
5763; VI-NEXT:    v_mov_b32_e32 v0, s3
5764; VI-NEXT:    flat_store_dword v[0:1], v0
5765; VI-NEXT:    s_waitcnt vmcnt(0)
5766; VI-NEXT:    s_endpgm
5767;
5768; GFX9-LABEL: array_3xi32:
5769; GFX9:       ; %bb.0:
5770; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
5771; GFX9-NEXT:    s_load_dword s1, s[4:5], 0x4
5772; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
5773; GFX9-NEXT:    s_load_dword s3, s[4:5], 0xc
5774; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5775; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5776; GFX9-NEXT:    v_mov_b32_e32 v1, s3
5777; GFX9-NEXT:    global_store_short v[0:1], v0, off
5778; GFX9-NEXT:    s_waitcnt vmcnt(0)
5779; GFX9-NEXT:    global_store_dword v[0:1], v1, off
5780; GFX9-NEXT:    s_waitcnt vmcnt(0)
5781; GFX9-NEXT:    v_mov_b32_e32 v0, s2
5782; GFX9-NEXT:    global_store_dword v[0:1], v0, off
5783; GFX9-NEXT:    s_waitcnt vmcnt(0)
5784; GFX9-NEXT:    v_mov_b32_e32 v0, s1
5785; GFX9-NEXT:    global_store_dword v[0:1], v0, off
5786; GFX9-NEXT:    s_waitcnt vmcnt(0)
5787; GFX9-NEXT:    s_endpgm
5788;
5789; EG-LABEL: array_3xi32:
5790; EG:       ; %bb.0:
5791; EG-NEXT:    ALU 0, @10, KC0[], KC1[]
5792; EG-NEXT:    TEX 0 @8
5793; EG-NEXT:    ALU 9, @11, KC0[CB0:0-32], KC1[]
5794; EG-NEXT:    MEM_RAT MSKOR T0.XW, T4.X
5795; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.X, T4.X, 0
5796; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
5797; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T4.X, 1
5798; EG-NEXT:    CF_END
5799; EG-NEXT:    Fetch clause starting at 8:
5800; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 36, #3
5801; EG-NEXT:    ALU clause starting at 10:
5802; EG-NEXT:     MOV * T0.X, 0.0,
5803; EG-NEXT:    ALU clause starting at 11:
5804; EG-NEXT:     AND_INT T0.X, T0.X, literal.x,
5805; EG-NEXT:     MOV * T0.W, literal.x,
5806; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5807; EG-NEXT:     MOV T0.Y, 0.0,
5808; EG-NEXT:     MOV * T0.Z, 0.0,
5809; EG-NEXT:     MOV T1.X, KC0[2].Z,
5810; EG-NEXT:     MOV * T2.X, KC0[2].W,
5811; EG-NEXT:     MOV T3.X, KC0[3].X,
5812; EG-NEXT:     MOV * T4.X, literal.x,
5813; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5814;
5815; CM-LABEL: array_3xi32:
5816; CM:       ; %bb.0:
5817; CM-NEXT:    ALU 0, @10, KC0[], KC1[]
5818; CM-NEXT:    TEX 0 @8
5819; CM-NEXT:    ALU 9, @11, KC0[CB0:0-32], KC1[]
5820; CM-NEXT:    MEM_RAT MSKOR T0.XW, T4.X
5821; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T3.X, T4.X
5822; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2.X, T4.X
5823; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T4.X
5824; CM-NEXT:    CF_END
5825; CM-NEXT:    Fetch clause starting at 8:
5826; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 36, #3
5827; CM-NEXT:    ALU clause starting at 10:
5828; CM-NEXT:     MOV * T0.X, 0.0,
5829; CM-NEXT:    ALU clause starting at 11:
5830; CM-NEXT:     AND_INT T0.X, T0.X, literal.x,
5831; CM-NEXT:     MOV * T0.W, literal.x,
5832; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5833; CM-NEXT:     MOV T0.Y, 0.0,
5834; CM-NEXT:     MOV * T0.Z, 0.0,
5835; CM-NEXT:     MOV * T1.X, KC0[2].Z,
5836; CM-NEXT:     MOV * T2.X, KC0[2].W,
5837; CM-NEXT:     MOV * T3.X, KC0[3].X,
5838; CM-NEXT:     MOV * T4.X, literal.x,
5839; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; Store the scalar first, then the whole array aggregate; the destination
; pointers are undef because only the argument loads matter to the checks.
5840  store volatile i16 %arg0, i16 addrspace(1)* undef
5841  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
5842  ret void
5843}
5844
; Kernel-argument layout test for an i8 followed by a [3 x i16] array; both
; are volatile-stored to undef addrspace(1) pointers.
; In the expected amdhsa gfx900 output the i8 comes from a scalar dword load at
; kernarg offset 0x0, while the three i16 elements are fetched with
; global_load_ushort at offsets 2, 4 and 6 rather than with scalar loads. That
; is presumably what the FIXME below refers to (confirm with the test author).
5845; FIXME: Why not all scalar loads?
5846define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
5847; SI-LABEL: array_3xi16:
5848; SI:       ; %bb.0:
5849; SI-NEXT:    s_load_dword s4, s[0:1], 0x9
5850; SI-NEXT:    s_mov_b32 s3, 0xf000
5851; SI-NEXT:    s_mov_b32 s2, -1
5852; SI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 offset:42
5853; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:40
5854; SI-NEXT:    buffer_load_ushort v2, off, s[0:3], 0 offset:38
5855; SI-NEXT:    s_waitcnt lgkmcnt(0)
5856; SI-NEXT:    v_mov_b32_e32 v3, s4
5857; SI-NEXT:    buffer_store_byte v3, off, s[0:3], 0
5858; SI-NEXT:    s_waitcnt vmcnt(0)
5859; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
5860; SI-NEXT:    s_waitcnt vmcnt(0)
5861; SI-NEXT:    buffer_store_short v1, off, s[0:3], 0
5862; SI-NEXT:    s_waitcnt vmcnt(0)
5863; SI-NEXT:    buffer_store_short v2, off, s[0:3], 0
5864; SI-NEXT:    s_waitcnt vmcnt(0)
5865; SI-NEXT:    s_endpgm
5866;
5867; VI-LABEL: array_3xi16:
5868; VI:       ; %bb.0:
5869; VI-NEXT:    s_add_u32 s2, s0, 38
5870; VI-NEXT:    s_addc_u32 s3, s1, 0
5871; VI-NEXT:    s_add_u32 s4, s2, 2
5872; VI-NEXT:    s_addc_u32 s5, s3, 0
5873; VI-NEXT:    v_mov_b32_e32 v2, s2
5874; VI-NEXT:    v_mov_b32_e32 v3, s3
5875; VI-NEXT:    s_add_u32 s2, s0, 42
5876; VI-NEXT:    s_addc_u32 s3, s1, 0
5877; VI-NEXT:    v_mov_b32_e32 v0, s4
5878; VI-NEXT:    v_mov_b32_e32 v5, s3
5879; VI-NEXT:    v_mov_b32_e32 v1, s5
5880; VI-NEXT:    v_mov_b32_e32 v4, s2
5881; VI-NEXT:    flat_load_ushort v0, v[0:1]
5882; VI-NEXT:    flat_load_ushort v1, v[2:3]
5883; VI-NEXT:    flat_load_ushort v2, v[4:5]
5884; VI-NEXT:    s_load_dword s0, s[0:1], 0x24
5885; VI-NEXT:    s_waitcnt lgkmcnt(0)
5886; VI-NEXT:    v_mov_b32_e32 v3, s0
5887; VI-NEXT:    s_waitcnt vmcnt(1)
5888; VI-NEXT:    flat_store_byte v[0:1], v3
5889; VI-NEXT:    s_waitcnt vmcnt(0)
5890; VI-NEXT:    flat_store_short v[0:1], v2
5891; VI-NEXT:    s_waitcnt vmcnt(0)
5892; VI-NEXT:    flat_store_short v[0:1], v1
5893; VI-NEXT:    s_waitcnt vmcnt(0)
5894; VI-NEXT:    flat_store_short v[0:1], v0
5895; VI-NEXT:    s_waitcnt vmcnt(0)
5896; VI-NEXT:    s_endpgm
5897;
5898; GFX9-LABEL: array_3xi16:
5899; GFX9:       ; %bb.0:
5900; GFX9-NEXT:    v_mov_b32_e32 v0, 0
5901; GFX9-NEXT:    global_load_ushort v1, v0, s[4:5] offset:2
5902; GFX9-NEXT:    global_load_ushort v2, v0, s[4:5] offset:4
5903; GFX9-NEXT:    global_load_ushort v3, v0, s[4:5] offset:6
5904; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x0
5905; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
5906; GFX9-NEXT:    v_mov_b32_e32 v0, s0
5907; GFX9-NEXT:    s_waitcnt vmcnt(2)
5908; GFX9-NEXT:    global_store_byte v[0:1], v0, off
5909; GFX9-NEXT:    s_waitcnt vmcnt(0)
5910; GFX9-NEXT:    global_store_short v[0:1], v3, off
5911; GFX9-NEXT:    s_waitcnt vmcnt(0)
5912; GFX9-NEXT:    global_store_short v[0:1], v2, off
5913; GFX9-NEXT:    s_waitcnt vmcnt(0)
5914; GFX9-NEXT:    global_store_short v[0:1], v1, off
5915; GFX9-NEXT:    s_waitcnt vmcnt(0)
5916; GFX9-NEXT:    s_endpgm
5917;
5918; EG-LABEL: array_3xi16:
5919; EG:       ; %bb.0:
5920; EG-NEXT:    ALU 0, @20, KC0[], KC1[]
5921; EG-NEXT:    TEX 1 @12
5922; EG-NEXT:    ALU 11, @21, KC0[], KC1[]
5923; EG-NEXT:    MEM_RAT MSKOR T1.XW, T3.X
5924; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5925; EG-NEXT:    TEX 0 @16
5926; EG-NEXT:    ALU 3, @33, KC0[], KC1[]
5927; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5928; EG-NEXT:    TEX 0 @18
5929; EG-NEXT:    ALU 3, @37, KC0[], KC1[]
5930; EG-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5931; EG-NEXT:    CF_END
5932; EG-NEXT:    Fetch clause starting at 12:
5933; EG-NEXT:     VTX_READ_8 T1.X, T0.X, 36, #3
5934; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 42, #3
5935; EG-NEXT:    Fetch clause starting at 16:
5936; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
5937; EG-NEXT:    Fetch clause starting at 18:
5938; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 38, #3
5939; EG-NEXT:    ALU clause starting at 20:
5940; EG-NEXT:     MOV * T0.X, 0.0,
5941; EG-NEXT:    ALU clause starting at 21:
5942; EG-NEXT:     AND_INT T1.X, T1.X, literal.x,
5943; EG-NEXT:     MOV * T1.W, literal.x,
5944; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
5945; EG-NEXT:     MOV * T1.Y, 0.0,
5946; EG-NEXT:     AND_INT T2.X, T2.X, literal.x,
5947; EG-NEXT:     MOV * T2.W, literal.x,
5948; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5949; EG-NEXT:     MOV T2.Y, 0.0,
5950; EG-NEXT:     MOV T1.Z, 0.0,
5951; EG-NEXT:     MOV * T2.Z, 0.0,
5952; EG-NEXT:     MOV * T3.X, literal.x,
5953; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
5954; EG-NEXT:    ALU clause starting at 33:
5955; EG-NEXT:     AND_INT T2.X, T1.X, literal.x,
5956; EG-NEXT:     MOV T2.Y, 0.0,
5957; EG-NEXT:     MOV * T2.Z, 0.0,
5958; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5959; EG-NEXT:    ALU clause starting at 37:
5960; EG-NEXT:     AND_INT T2.X, T0.X, literal.x,
5961; EG-NEXT:     MOV T2.Y, 0.0,
5962; EG-NEXT:     MOV * T2.Z, 0.0,
5963; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5964;
5965; CM-LABEL: array_3xi16:
5966; CM:       ; %bb.0:
5967; CM-NEXT:    ALU 0, @20, KC0[], KC1[]
5968; CM-NEXT:    TEX 1 @12
5969; CM-NEXT:    ALU 11, @21, KC0[], KC1[]
5970; CM-NEXT:    MEM_RAT MSKOR T1.XW, T3.X
5971; CM-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5972; CM-NEXT:    TEX 0 @16
5973; CM-NEXT:    ALU 3, @33, KC0[], KC1[]
5974; CM-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5975; CM-NEXT:    TEX 0 @18
5976; CM-NEXT:    ALU 3, @37, KC0[], KC1[]
5977; CM-NEXT:    MEM_RAT MSKOR T2.XW, T3.X
5978; CM-NEXT:    CF_END
5979; CM-NEXT:    Fetch clause starting at 12:
5980; CM-NEXT:     VTX_READ_8 T1.X, T0.X, 36, #3
5981; CM-NEXT:     VTX_READ_16 T2.X, T0.X, 42, #3
5982; CM-NEXT:    Fetch clause starting at 16:
5983; CM-NEXT:     VTX_READ_16 T1.X, T0.X, 40, #3
5984; CM-NEXT:    Fetch clause starting at 18:
5985; CM-NEXT:     VTX_READ_16 T0.X, T0.X, 38, #3
5986; CM-NEXT:    ALU clause starting at 20:
5987; CM-NEXT:     MOV * T0.X, 0.0,
5988; CM-NEXT:    ALU clause starting at 21:
5989; CM-NEXT:     AND_INT T1.X, T1.X, literal.x,
5990; CM-NEXT:     MOV * T1.W, literal.x,
5991; CM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
5992; CM-NEXT:     MOV * T1.Y, 0.0,
5993; CM-NEXT:     AND_INT T2.X, T2.X, literal.x,
5994; CM-NEXT:     MOV * T2.W, literal.x,
5995; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
5996; CM-NEXT:     MOV T2.Y, 0.0,
5997; CM-NEXT:     MOV * T1.Z, 0.0,
5998; CM-NEXT:     MOV * T2.Z, 0.0,
5999; CM-NEXT:     MOV * T3.X, literal.x,
6000; CM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
6001; CM-NEXT:    ALU clause starting at 33:
6002; CM-NEXT:     AND_INT T2.X, T1.X, literal.x,
6003; CM-NEXT:     MOV T2.Y, 0.0,
6004; CM-NEXT:     MOV * T2.Z, 0.0,
6005; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
6006; CM-NEXT:    ALU clause starting at 37:
6007; CM-NEXT:     AND_INT T2.X, T0.X, literal.x,
6008; CM-NEXT:     MOV T2.Y, 0.0,
6009; CM-NEXT:     MOV * T2.Z, 0.0,
6010; CM-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; Store the byte first, then the whole array aggregate; the destination
; pointers are undef because only the argument loads matter to the checks.
6011  store volatile i8 %arg0, i8 addrspace(1)* undef
6012  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
6013  ret void
6014}
6015
; Kernel-argument layout test for an unnamed i8 followed by a [1 x i8] array.
; The single array element is extracted and volatile-stored to an undef
; addrspace(1) pointer.
; The expected amdhsa gfx900 output loads that byte from kernarg offset 1 (the
; other configurations' checks use offset 37), so the element's exact byte
; offset is used. Judging by the function name, the test presumably guards
; against that offset being rounded down to a dword boundary (confirm).
6016define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
6017; SI-LABEL: small_array_round_down_offset:
6018; SI:       ; %bb.0:
6019; SI-NEXT:    s_mov_b32 s3, 0xf000
6020; SI-NEXT:    s_mov_b32 s2, -1
6021; SI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:37
6022; SI-NEXT:    s_waitcnt vmcnt(0)
6023; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
6024; SI-NEXT:    s_waitcnt vmcnt(0)
6025; SI-NEXT:    s_endpgm
6026;
6027; VI-LABEL: small_array_round_down_offset:
6028; VI:       ; %bb.0:
6029; VI-NEXT:    s_add_u32 s0, s0, 37
6030; VI-NEXT:    s_addc_u32 s1, s1, 0
6031; VI-NEXT:    v_mov_b32_e32 v0, s0
6032; VI-NEXT:    v_mov_b32_e32 v1, s1
6033; VI-NEXT:    flat_load_ubyte v0, v[0:1]
6034; VI-NEXT:    s_waitcnt vmcnt(0)
6035; VI-NEXT:    flat_store_byte v[0:1], v0
6036; VI-NEXT:    s_waitcnt vmcnt(0)
6037; VI-NEXT:    s_endpgm
6038;
6039; GFX9-LABEL: small_array_round_down_offset:
6040; GFX9:       ; %bb.0:
6041; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6042; GFX9-NEXT:    global_load_ubyte v0, v0, s[4:5] offset:1
6043; GFX9-NEXT:    s_waitcnt vmcnt(0)
6044; GFX9-NEXT:    global_store_byte v[0:1], v0, off
6045; GFX9-NEXT:    s_waitcnt vmcnt(0)
6046; GFX9-NEXT:    s_endpgm
6047;
6048; EGCM-LABEL: small_array_round_down_offset:
6049; EGCM:       ; %bb.0:
6050; EGCM-NEXT:    ALU 0, @8, KC0[], KC1[]
6051; EGCM-NEXT:    TEX 0 @6
6052; EGCM-NEXT:    ALU 6, @9, KC0[], KC1[]
6053; EGCM-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
6054; EGCM-NEXT:    CF_END
6055; EGCM-NEXT:    PAD
6056; EGCM-NEXT:    Fetch clause starting at 6:
6057; EGCM-NEXT:     VTX_READ_8 T0.X, T0.X, 37, #3
6058; EGCM-NEXT:    ALU clause starting at 8:
6059; EGCM-NEXT:     MOV * T0.X, 0.0,
6060; EGCM-NEXT:    ALU clause starting at 9:
6061; EGCM-NEXT:     AND_INT T0.X, T0.X, literal.x,
6062; EGCM-NEXT:     MOV * T0.W, literal.x,
6063; EGCM-NEXT:    255(3.573311e-43), 0(0.000000e+00)
6064; EGCM-NEXT:     MOV T0.Y, 0.0,
6065; EGCM-NEXT:     MOV * T0.Z, 0.0,
6066; EGCM-NEXT:     MOV * T1.X, literal.x,
6067; EGCM-NEXT:    0(0.000000e+00), 0(0.000000e+00)
; Only the array element is used; the leading unnamed i8 exists solely to
; shift the array to a non-zero byte offset within the kernarg segment.
6068  %val = extractvalue [1 x i8] %arg, 0
6069  store volatile i8 %val, i8 addrspace(1)* undef
6070  ret void
6071}
6072
; Tests a byref(i32) kernel argument in addrspace(4) carrying an explicit
; align(256), placed between a global pointer %out and a trailing i32.
; The body loads the i32 through %in.byref and volatile-stores it, followed by
; %after.offset, to %out.
; In the expected amdhsa gfx900 output %out is loaded from kernarg offset 0x0
; and a single s_load_dwordx2 at offset 0x100 (256) supplies both stored
; values, i.e. the byref i32 sits at the 256-byte-aligned offset and
; %after.offset directly follows it at offset 260.
6073define amdgpu_kernel void @byref_align_constant_i32_arg(i32 addrspace(1)* nocapture %out, i32 addrspace(4)* byref(i32) align(256) %in.byref, i32 %after.offset) {
6074; SI-LABEL: byref_align_constant_i32_arg:
6075; SI:       ; %bb.0:
6076; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x49
6077; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
6078; SI-NEXT:    s_mov_b32 s3, 0xf000
6079; SI-NEXT:    s_mov_b32 s2, -1
6080; SI-NEXT:    s_waitcnt lgkmcnt(0)
6081; SI-NEXT:    v_mov_b32_e32 v0, s4
6082; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6083; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6084; SI-NEXT:    v_mov_b32_e32 v0, s5
6085; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
6086; SI-NEXT:    s_waitcnt vmcnt(0)
6087; SI-NEXT:    s_endpgm
6088;
6089; VI-LABEL: byref_align_constant_i32_arg:
6090; VI:       ; %bb.0:
6091; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
6092; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x124
6093; VI-NEXT:    s_waitcnt lgkmcnt(0)
6094; VI-NEXT:    v_mov_b32_e32 v0, s2
6095; VI-NEXT:    v_mov_b32_e32 v1, s3
6096; VI-NEXT:    v_mov_b32_e32 v2, s0
6097; VI-NEXT:    v_mov_b32_e32 v3, s1
6098; VI-NEXT:    flat_store_dword v[0:1], v2
6099; VI-NEXT:    s_waitcnt vmcnt(0)
6100; VI-NEXT:    flat_store_dword v[0:1], v3
6101; VI-NEXT:    s_waitcnt vmcnt(0)
6102; VI-NEXT:    s_endpgm
6103;
6104; GFX9-LABEL: byref_align_constant_i32_arg:
6105; GFX9:       ; %bb.0:
6106; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6107; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x100
6108; GFX9-NEXT:    v_mov_b32_e32 v0, 0
6109; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6110; GFX9-NEXT:    v_mov_b32_e32 v1, s2
6111; GFX9-NEXT:    v_mov_b32_e32 v2, s3
6112; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
6113; GFX9-NEXT:    s_waitcnt vmcnt(0)
6114; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
6115; GFX9-NEXT:    s_waitcnt vmcnt(0)
6116; GFX9-NEXT:    s_endpgm
6117;
6118; EG-LABEL: byref_align_constant_i32_arg:
6119; EG:       ; %bb.0:
6120; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
6121; EG-NEXT:    TEX 0 @6
6122; EG-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
6123; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 0
6124; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
6125; EG-NEXT:    CF_END
6126; EG-NEXT:    Fetch clause starting at 6:
6127; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
6128; EG-NEXT:    ALU clause starting at 8:
6129; EG-NEXT:     MOV * T0.X, KC0[18].Y,
6130; EG-NEXT:    ALU clause starting at 9:
6131; EG-NEXT:     MOV T1.X, KC0[18].Z,
6132; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6133; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6134;
6135; CM-LABEL: byref_align_constant_i32_arg:
6136; CM:       ; %bb.0:
6137; CM-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
6138; CM-NEXT:    TEX 0 @6
6139; CM-NEXT:    ALU 2, @9, KC0[CB0:0-32], KC1[]
6140; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0.X, T2.X
6141; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
6142; CM-NEXT:    CF_END
6143; CM-NEXT:    Fetch clause starting at 6:
6144; CM-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
6145; CM-NEXT:    ALU clause starting at 8:
6146; CM-NEXT:     MOV * T0.X, KC0[18].Y,
6147; CM-NEXT:    ALU clause starting at 9:
6148; CM-NEXT:     MOV * T1.X, KC0[18].Z,
6149; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6150; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; Read the value behind the byref pointer, then store it and the trailing
; argument to %out; both stores are volatile and target the same address.
6151  %in = load i32, i32 addrspace(4)* %in.byref
6152  store volatile i32 %in, i32 addrspace(1)* %out, align 4
6153  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
6154  ret void
6155}
6156
; Kernel that receives a <16 x i32> through a byref pointer in addrspace(4);
; the byref parameter carries no explicit align attribute. The body loads the
; whole vector through %in.byref, volatile-stores it to %out (bitcast to a
; <16 x i32> pointer), and then volatile-stores %after.offset to %out.
; NOTE(review): the unnamed i8 ahead of the byref parameter and the trailing
; i32 presumably exist to show where the vector and the argument after it are
; placed; the GFX9 checks below expect them at offsets 0x40 and 0x80 from
; s[4:5] - confirm against the target's argument layout rules.
; The SI/VI/GFX9/EG/CM check lines are autogenerated (see the NOTE on the
; first line of this file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
6157define amdgpu_kernel void @byref_natural_align_constant_v16i32_arg(i32 addrspace(1)* nocapture %out, i8, <16 x i32> addrspace(4)* byref(<16 x i32>) %in.byref, i32 %after.offset) {
6158; SI-LABEL: byref_natural_align_constant_v16i32_arg:
6159; SI:       ; %bb.0:
6160; SI-NEXT:    s_load_dwordx16 s[4:19], s[0:1], 0x19
6161; SI-NEXT:    s_load_dwordx2 s[20:21], s[0:1], 0x9
6162; SI-NEXT:    s_load_dword s0, s[0:1], 0x29
6163; SI-NEXT:    s_mov_b32 s23, 0xf000
6164; SI-NEXT:    s_mov_b32 s22, -1
6165; SI-NEXT:    s_waitcnt lgkmcnt(0)
6166; SI-NEXT:    v_mov_b32_e32 v0, s16
6167; SI-NEXT:    v_mov_b32_e32 v1, s17
6168; SI-NEXT:    v_mov_b32_e32 v2, s18
6169; SI-NEXT:    v_mov_b32_e32 v3, s19
6170; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:48
6171; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6172; SI-NEXT:    v_mov_b32_e32 v0, s12
6173; SI-NEXT:    v_mov_b32_e32 v1, s13
6174; SI-NEXT:    v_mov_b32_e32 v2, s14
6175; SI-NEXT:    v_mov_b32_e32 v3, s15
6176; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:32
6177; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6178; SI-NEXT:    v_mov_b32_e32 v0, s8
6179; SI-NEXT:    v_mov_b32_e32 v1, s9
6180; SI-NEXT:    v_mov_b32_e32 v2, s10
6181; SI-NEXT:    v_mov_b32_e32 v3, s11
6182; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0 offset:16
6183; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6184; SI-NEXT:    v_mov_b32_e32 v0, s4
6185; SI-NEXT:    v_mov_b32_e32 v1, s5
6186; SI-NEXT:    v_mov_b32_e32 v2, s6
6187; SI-NEXT:    v_mov_b32_e32 v3, s7
6188; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[20:23], 0
6189; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
6190; SI-NEXT:    v_mov_b32_e32 v0, s0
6191; SI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
6192; SI-NEXT:    s_waitcnt vmcnt(0)
6193; SI-NEXT:    s_endpgm
6194;
6195; VI-LABEL: byref_natural_align_constant_v16i32_arg:
6196; VI:       ; %bb.0:
6197; VI-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x24
6198; VI-NEXT:    s_load_dword s18, s[0:1], 0xa4
6199; VI-NEXT:    s_load_dwordx16 s[0:15], s[0:1], 0x64
6200; VI-NEXT:    s_waitcnt lgkmcnt(0)
6201; VI-NEXT:    v_mov_b32_e32 v0, s12
6202; VI-NEXT:    s_add_u32 s12, s16, 48
6203; VI-NEXT:    v_mov_b32_e32 v1, s13
6204; VI-NEXT:    s_addc_u32 s13, s17, 0
6205; VI-NEXT:    v_mov_b32_e32 v4, s12
6206; VI-NEXT:    v_mov_b32_e32 v2, s14
6207; VI-NEXT:    v_mov_b32_e32 v3, s15
6208; VI-NEXT:    v_mov_b32_e32 v5, s13
6209; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6210; VI-NEXT:    s_waitcnt vmcnt(0)
6211; VI-NEXT:    v_mov_b32_e32 v0, s8
6212; VI-NEXT:    s_add_u32 s8, s16, 32
6213; VI-NEXT:    v_mov_b32_e32 v1, s9
6214; VI-NEXT:    s_addc_u32 s9, s17, 0
6215; VI-NEXT:    v_mov_b32_e32 v4, s8
6216; VI-NEXT:    v_mov_b32_e32 v2, s10
6217; VI-NEXT:    v_mov_b32_e32 v3, s11
6218; VI-NEXT:    v_mov_b32_e32 v5, s9
6219; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6220; VI-NEXT:    s_waitcnt vmcnt(0)
6221; VI-NEXT:    v_mov_b32_e32 v0, s4
6222; VI-NEXT:    s_add_u32 s4, s16, 16
6223; VI-NEXT:    v_mov_b32_e32 v1, s5
6224; VI-NEXT:    s_addc_u32 s5, s17, 0
6225; VI-NEXT:    v_mov_b32_e32 v4, s4
6226; VI-NEXT:    v_mov_b32_e32 v2, s6
6227; VI-NEXT:    v_mov_b32_e32 v3, s7
6228; VI-NEXT:    v_mov_b32_e32 v5, s5
6229; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6230; VI-NEXT:    s_waitcnt vmcnt(0)
6231; VI-NEXT:    v_mov_b32_e32 v4, s16
6232; VI-NEXT:    v_mov_b32_e32 v0, s0
6233; VI-NEXT:    v_mov_b32_e32 v1, s1
6234; VI-NEXT:    v_mov_b32_e32 v2, s2
6235; VI-NEXT:    v_mov_b32_e32 v3, s3
6236; VI-NEXT:    v_mov_b32_e32 v5, s17
6237; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
6238; VI-NEXT:    s_waitcnt vmcnt(0)
6239; VI-NEXT:    v_mov_b32_e32 v0, s18
6240; VI-NEXT:    flat_store_dword v[4:5], v0
6241; VI-NEXT:    s_waitcnt vmcnt(0)
6242; VI-NEXT:    s_endpgm
6243;
6244; GFX9-LABEL: byref_natural_align_constant_v16i32_arg:
6245; GFX9:       ; %bb.0:
6246; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x80
6247; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x40
6248; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
6249; GFX9-NEXT:    v_mov_b32_e32 v4, 0
6250; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
6251; GFX9-NEXT:    v_mov_b32_e32 v0, s20
6252; GFX9-NEXT:    v_mov_b32_e32 v1, s21
6253; GFX9-NEXT:    v_mov_b32_e32 v2, s22
6254; GFX9-NEXT:    v_mov_b32_e32 v3, s23
6255; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
6256; GFX9-NEXT:    s_waitcnt vmcnt(0)
6257; GFX9-NEXT:    v_mov_b32_e32 v0, s16
6258; GFX9-NEXT:    v_mov_b32_e32 v1, s17
6259; GFX9-NEXT:    v_mov_b32_e32 v2, s18
6260; GFX9-NEXT:    v_mov_b32_e32 v3, s19
6261; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
6262; GFX9-NEXT:    s_waitcnt vmcnt(0)
6263; GFX9-NEXT:    v_mov_b32_e32 v0, s12
6264; GFX9-NEXT:    v_mov_b32_e32 v1, s13
6265; GFX9-NEXT:    v_mov_b32_e32 v2, s14
6266; GFX9-NEXT:    v_mov_b32_e32 v3, s15
6267; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
6268; GFX9-NEXT:    s_waitcnt vmcnt(0)
6269; GFX9-NEXT:    v_mov_b32_e32 v0, s8
6270; GFX9-NEXT:    v_mov_b32_e32 v1, s9
6271; GFX9-NEXT:    v_mov_b32_e32 v2, s10
6272; GFX9-NEXT:    v_mov_b32_e32 v3, s11
6273; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
6274; GFX9-NEXT:    s_waitcnt vmcnt(0)
6275; GFX9-NEXT:    v_mov_b32_e32 v0, s2
6276; GFX9-NEXT:    global_store_dword v4, v0, s[0:1]
6277; GFX9-NEXT:    s_waitcnt vmcnt(0)
6278; GFX9-NEXT:    s_endpgm
6279;
6280; EG-LABEL: byref_natural_align_constant_v16i32_arg:
6281; EG:       ; %bb.0:
6282; EG-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
6283; EG-NEXT:    TEX 0 @16
6284; EG-NEXT:    ALU 3, @25, KC0[CB0:0-32], KC1[]
6285; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
6286; EG-NEXT:    ALU 3, @29, KC0[CB0:0-32], KC1[]
6287; EG-NEXT:    TEX 0 @18
6288; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6289; EG-NEXT:    ALU 3, @33, KC0[CB0:0-32], KC1[]
6290; EG-NEXT:    TEX 0 @20
6291; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T1.X, 0
6292; EG-NEXT:    ALU 2, @37, KC0[CB0:0-32], KC1[]
6293; EG-NEXT:    TEX 0 @22
6294; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 0
6295; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 1
6296; EG-NEXT:    CF_END
6297; EG-NEXT:    PAD
6298; EG-NEXT:    Fetch clause starting at 16:
6299; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
6300; EG-NEXT:    Fetch clause starting at 18:
6301; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
6302; EG-NEXT:    Fetch clause starting at 20:
6303; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
6304; EG-NEXT:    Fetch clause starting at 22:
6305; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
6306; EG-NEXT:    ALU clause starting at 24:
6307; EG-NEXT:     MOV * T0.X, KC0[6].Y,
6308; EG-NEXT:    ALU clause starting at 25:
6309; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6310; EG-NEXT:    48(6.726233e-44), 0(0.000000e+00)
6311; EG-NEXT:     LSHR * T2.X, PV.W, literal.x,
6312; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6313; EG-NEXT:    ALU clause starting at 29:
6314; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6315; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
6316; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
6317; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6318; EG-NEXT:    ALU clause starting at 33:
6319; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6320; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6321; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
6322; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6323; EG-NEXT:    ALU clause starting at 37:
6324; EG-NEXT:     MOV T1.X, KC0[10].Y,
6325; EG-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6326; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6327;
6328; CM-LABEL: byref_natural_align_constant_v16i32_arg:
6329; CM:       ; %bb.0:
6330; CM-NEXT:    ALU 0, @24, KC0[CB0:0-32], KC1[]
6331; CM-NEXT:    TEX 0 @16
6332; CM-NEXT:    ALU 3, @25, KC0[CB0:0-32], KC1[]
6333; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
6334; CM-NEXT:    ALU 3, @29, KC0[CB0:0-32], KC1[]
6335; CM-NEXT:    TEX 0 @18
6336; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6337; CM-NEXT:    ALU 3, @33, KC0[CB0:0-32], KC1[]
6338; CM-NEXT:    TEX 0 @20
6339; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T2, T1.X
6340; CM-NEXT:    ALU 2, @37, KC0[CB0:0-32], KC1[]
6341; CM-NEXT:    TEX 0 @22
6342; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
6343; CM-NEXT:    MEM_RAT_CACHELESS STORE_DWORD T1.X, T2.X
6344; CM-NEXT:    CF_END
6345; CM-NEXT:    PAD
6346; CM-NEXT:    Fetch clause starting at 16:
6347; CM-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 48, #1
6348; CM-NEXT:    Fetch clause starting at 18:
6349; CM-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 32, #1
6350; CM-NEXT:    Fetch clause starting at 20:
6351; CM-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 16, #1
6352; CM-NEXT:    Fetch clause starting at 22:
6353; CM-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
6354; CM-NEXT:    ALU clause starting at 24:
6355; CM-NEXT:     MOV * T0.X, KC0[6].Y,
6356; CM-NEXT:    ALU clause starting at 25:
6357; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6358; CM-NEXT:    48(6.726233e-44), 0(0.000000e+00)
6359; CM-NEXT:     LSHR * T2.X, PV.W, literal.x,
6360; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6361; CM-NEXT:    ALU clause starting at 29:
6362; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6363; CM-NEXT:    32(4.484155e-44), 0(0.000000e+00)
6364; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
6365; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6366; CM-NEXT:    ALU clause starting at 33:
6367; CM-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
6368; CM-NEXT:    16(2.242078e-44), 0(0.000000e+00)
6369; CM-NEXT:     LSHR * T1.X, PV.W, literal.x,
6370; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6371; CM-NEXT:    ALU clause starting at 37:
6372; CM-NEXT:     MOV * T1.X, KC0[10].Y,
6373; CM-NEXT:     LSHR * T2.X, KC0[2].Y, literal.x,
6374; CM-NEXT:    2(2.802597e-45), 0(0.000000e+00)
6375  %in = load <16 x i32>, <16 x i32> addrspace(4)* %in.byref
6376  %cast.out = bitcast i32 addrspace(1)* %out to <16 x i32> addrspace(1)*
6377  store volatile <16 x i32> %in, <16 x i32> addrspace(1)* %cast.out, align 4
6378  store volatile i32 %after.offset, i32 addrspace(1)* %out, align 4
6379  ret void
6380}
6381