1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
5
; Externally defined function called from both test functions in this file;
; its attributes are listed in attribute group #2.
6declare void @extern_func() #2
7
8define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
9; The vgpr tuple8 operand in the image_gather4_c_b_cl instruction need not be
10; preserved across the call and should get 8 scratch registers.
11; GFX9-LABEL: non_preserved_vgpr_tuple8:
12; GFX9:       ; %bb.0: ; %main_body
13; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
14; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
15; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
16; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
17; GFX9-NEXT:    s_mov_b32 s4, 0
18; GFX9-NEXT:    v_writelane_b32 v40, s33, 2
19; GFX9-NEXT:    s_mov_b32 s33, s32
20; GFX9-NEXT:    v_mov_b32_e32 v36, v16
21; GFX9-NEXT:    v_mov_b32_e32 v35, v15
22; GFX9-NEXT:    v_mov_b32_e32 v34, v14
23; GFX9-NEXT:    v_mov_b32_e32 v33, v13
24; GFX9-NEXT:    v_mov_b32_e32 v32, v12
25; GFX9-NEXT:    s_mov_b32 s5, s4
26; GFX9-NEXT:    s_mov_b32 s6, s4
27; GFX9-NEXT:    s_mov_b32 s7, s4
28; GFX9-NEXT:    s_mov_b32 s8, s4
29; GFX9-NEXT:    s_mov_b32 s9, s4
30; GFX9-NEXT:    s_mov_b32 s10, s4
31; GFX9-NEXT:    s_mov_b32 s11, s4
32; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
33; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
34; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
35; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
36; GFX9-NEXT:    ;;#ASMSTART
37; GFX9-NEXT:    ;;#ASMEND
38; GFX9-NEXT:    ;;#ASMSTART
39; GFX9-NEXT:    ;;#ASMEND
40; GFX9-NEXT:    ;;#ASMSTART
41; GFX9-NEXT:    ;;#ASMEND
42; GFX9-NEXT:    ;;#ASMSTART
43; GFX9-NEXT:    ;;#ASMEND
44; GFX9-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
45; GFX9-NEXT:    s_addk_i32 s32, 0x800
46; GFX9-NEXT:    s_getpc_b64 s[4:5]
47; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
48; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
49; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
50; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
51; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
52; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
54; GFX9-NEXT:    v_mov_b32_e32 v0, v41
55; GFX9-NEXT:    v_mov_b32_e32 v1, v42
56; GFX9-NEXT:    v_mov_b32_e32 v2, v43
57; GFX9-NEXT:    v_mov_b32_e32 v3, v44
58; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
59; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
60; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
61; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
62; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
63; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
64; GFX9-NEXT:    s_addk_i32 s32, 0xf800
65; GFX9-NEXT:    v_readlane_b32 s33, v40, 2
66; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
67; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
68; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
69; GFX9-NEXT:    s_waitcnt vmcnt(0)
70; GFX9-NEXT:    s_setpc_b64 s[30:31]
71;
72; GFX10-LABEL: non_preserved_vgpr_tuple8:
73; GFX10:       ; %bb.0: ; %main_body
74; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
76; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
77; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
78; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
79; GFX10-NEXT:    s_mov_b32 exec_lo, s4
80; GFX10-NEXT:    v_mov_b32_e32 v36, v16
81; GFX10-NEXT:    v_mov_b32_e32 v35, v15
82; GFX10-NEXT:    v_mov_b32_e32 v34, v14
83; GFX10-NEXT:    v_mov_b32_e32 v33, v13
84; GFX10-NEXT:    v_mov_b32_e32 v32, v12
85; GFX10-NEXT:    s_mov_b32 s4, 0
86; GFX10-NEXT:    v_writelane_b32 v40, s33, 2
87; GFX10-NEXT:    s_mov_b32 s33, s32
88; GFX10-NEXT:    s_mov_b32 s5, s4
89; GFX10-NEXT:    s_mov_b32 s6, s4
90; GFX10-NEXT:    s_mov_b32 s7, s4
91; GFX10-NEXT:    s_mov_b32 s8, s4
92; GFX10-NEXT:    s_mov_b32 s9, s4
93; GFX10-NEXT:    s_mov_b32 s10, s4
94; GFX10-NEXT:    s_mov_b32 s11, s4
95; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
96; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
97; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
98; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
99; GFX10-NEXT:    ;;#ASMSTART
100; GFX10-NEXT:    ;;#ASMEND
101; GFX10-NEXT:    ;;#ASMSTART
102; GFX10-NEXT:    ;;#ASMEND
103; GFX10-NEXT:    ;;#ASMSTART
104; GFX10-NEXT:    ;;#ASMEND
105; GFX10-NEXT:    ;;#ASMSTART
106; GFX10-NEXT:    ;;#ASMEND
107; GFX10-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
108; GFX10-NEXT:    s_addk_i32 s32, 0x400
109; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
110; GFX10-NEXT:    s_getpc_b64 s[4:5]
111; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
112; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
113; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
114; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
115; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
116; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
118; GFX10-NEXT:    v_mov_b32_e32 v0, v41
119; GFX10-NEXT:    v_mov_b32_e32 v1, v42
120; GFX10-NEXT:    v_mov_b32_e32 v2, v43
121; GFX10-NEXT:    v_mov_b32_e32 v3, v44
122; GFX10-NEXT:    s_clause 0x3
123; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33
124; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:4
125; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:8
126; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:12
127; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
128; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
129; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
130; GFX10-NEXT:    v_readlane_b32 s33, v40, 2
131; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
132; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
133; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
134; GFX10-NEXT:    s_mov_b32 exec_lo, s4
135; GFX10-NEXT:    s_waitcnt vmcnt(0)
136; GFX10-NEXT:    s_setpc_b64 s[30:31]
137;
138; GFX11-LABEL: non_preserved_vgpr_tuple8:
139; GFX11:       ; %bb.0: ; %main_body
140; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
142; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
143; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:16 ; 4-byte Folded Spill
144; GFX11-NEXT:    s_mov_b32 exec_lo, s0
145; GFX11-NEXT:    v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15
146; GFX11-NEXT:    v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
147; GFX11-NEXT:    v_mov_b32_e32 v32, v12
148; GFX11-NEXT:    s_mov_b32 s0, 0
149; GFX11-NEXT:    v_writelane_b32 v40, s33, 2
150; GFX11-NEXT:    s_mov_b32 s33, s32
151; GFX11-NEXT:    s_mov_b32 s1, s0
152; GFX11-NEXT:    s_mov_b32 s2, s0
153; GFX11-NEXT:    s_mov_b32 s3, s0
154; GFX11-NEXT:    s_mov_b32 s4, s0
155; GFX11-NEXT:    s_mov_b32 s5, s0
156; GFX11-NEXT:    s_mov_b32 s6, s0
157; GFX11-NEXT:    s_mov_b32 s7, s0
158; GFX11-NEXT:    s_clause 0x3
159; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:12
160; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:8
161; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:4
162; GFX11-NEXT:    scratch_store_b32 off, v44, s33
163; GFX11-NEXT:    ;;#ASMSTART
164; GFX11-NEXT:    ;;#ASMEND
165; GFX11-NEXT:    ;;#ASMSTART
166; GFX11-NEXT:    ;;#ASMEND
167; GFX11-NEXT:    ;;#ASMSTART
168; GFX11-NEXT:    ;;#ASMEND
169; GFX11-NEXT:    ;;#ASMSTART
170; GFX11-NEXT:    ;;#ASMEND
171; GFX11-NEXT:    image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
172; GFX11-NEXT:    s_add_i32 s32, s32, 32
173; GFX11-NEXT:    s_getpc_b64 s[0:1]
174; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
175; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
176; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
177; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
178; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
179; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
180; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
181; GFX11-NEXT:    v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42
182; GFX11-NEXT:    v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44
183; GFX11-NEXT:    s_clause 0x3
184; GFX11-NEXT:    scratch_load_b32 v44, off, s33
185; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:4
186; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:8
187; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:12
188; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
189; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
190; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
191; GFX11-NEXT:    v_readlane_b32 s33, v40, 2
192; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
193; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:16 ; 4-byte Folded Reload
194; GFX11-NEXT:    s_mov_b32 exec_lo, s0
195; GFX11-NEXT:    s_waitcnt vmcnt(0)
196; GFX11-NEXT:    s_setpc_b64 s[30:31]
197
198
199
200
201
202
203
204
205
206
207main_body:
  ; The four empty inline-asm statements below clobber v0-v31 (eight registers
  ; each). In the checks above, the gather4 address operand is then placed in
  ; v[32:36] and its result in v[41:44].
208  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
209  call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
210  call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
211  call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
212  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
  ; Only the gather4 result %v is used after the call (it is the return value);
  ; the address operands are not referenced again.
213  call void @extern_func()
214  ret <4 x float> %v
215}
216
217define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
218; The vgpr tuple8 operand in the image_gather4_c_b_cl instruction needs to be
219; preserved across the call and should get allocated to 8 CSRs.
220; Only the lower 5 sub-registers of the tuple are preserved.
221; The upper 3 sub-registers are unused.
222; GFX9-LABEL: call_preserved_vgpr_tuple8:
223; GFX9:       ; %bb.0: ; %main_body
224; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
225; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
226; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
227; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
228; GFX9-NEXT:    v_writelane_b32 v40, s33, 10
229; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
230; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
231; GFX9-NEXT:    v_writelane_b32 v40, s36, 2
232; GFX9-NEXT:    v_writelane_b32 v40, s37, 3
233; GFX9-NEXT:    v_writelane_b32 v40, s38, 4
234; GFX9-NEXT:    v_writelane_b32 v40, s39, 5
235; GFX9-NEXT:    v_writelane_b32 v40, s40, 6
236; GFX9-NEXT:    v_writelane_b32 v40, s41, 7
237; GFX9-NEXT:    s_mov_b32 s33, s32
238; GFX9-NEXT:    v_writelane_b32 v40, s42, 8
239; GFX9-NEXT:    s_mov_b32 s36, 0
240; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
241; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
242; GFX9-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
243; GFX9-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
244; GFX9-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
245; GFX9-NEXT:    v_writelane_b32 v40, s43, 9
246; GFX9-NEXT:    v_mov_b32_e32 v45, v16
247; GFX9-NEXT:    v_mov_b32_e32 v44, v15
248; GFX9-NEXT:    v_mov_b32_e32 v43, v14
249; GFX9-NEXT:    v_mov_b32_e32 v42, v13
250; GFX9-NEXT:    v_mov_b32_e32 v41, v12
251; GFX9-NEXT:    s_mov_b32 s37, s36
252; GFX9-NEXT:    s_mov_b32 s38, s36
253; GFX9-NEXT:    s_mov_b32 s39, s36
254; GFX9-NEXT:    s_mov_b32 s40, s36
255; GFX9-NEXT:    s_mov_b32 s41, s36
256; GFX9-NEXT:    s_mov_b32 s42, s36
257; GFX9-NEXT:    s_mov_b32 s43, s36
258; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
259; GFX9-NEXT:    s_addk_i32 s32, 0x800
260; GFX9-NEXT:    s_getpc_b64 s[4:5]
261; GFX9-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
262; GFX9-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
263; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
264; GFX9-NEXT:    s_waitcnt vmcnt(0)
265; GFX9-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
266; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
268; GFX9-NEXT:    image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
269; GFX9-NEXT:    s_nop 0
270; GFX9-NEXT:    buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
271; GFX9-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
272; GFX9-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
273; GFX9-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
274; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
275; GFX9-NEXT:    v_readlane_b32 s43, v40, 9
276; GFX9-NEXT:    v_readlane_b32 s42, v40, 8
277; GFX9-NEXT:    v_readlane_b32 s41, v40, 7
278; GFX9-NEXT:    v_readlane_b32 s40, v40, 6
279; GFX9-NEXT:    v_readlane_b32 s39, v40, 5
280; GFX9-NEXT:    v_readlane_b32 s38, v40, 4
281; GFX9-NEXT:    v_readlane_b32 s37, v40, 3
282; GFX9-NEXT:    v_readlane_b32 s36, v40, 2
283; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
284; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
285; GFX9-NEXT:    s_addk_i32 s32, 0xf800
286; GFX9-NEXT:    v_readlane_b32 s33, v40, 10
287; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
288; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
289; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
290; GFX9-NEXT:    s_waitcnt vmcnt(0)
291; GFX9-NEXT:    s_setpc_b64 s[30:31]
292;
293; GFX10-LABEL: call_preserved_vgpr_tuple8:
294; GFX10:       ; %bb.0: ; %main_body
295; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
296; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
297; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
298; GFX10-NEXT:    buffer_store_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
299; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
300; GFX10-NEXT:    s_mov_b32 exec_lo, s4
301; GFX10-NEXT:    v_writelane_b32 v40, s33, 10
302; GFX10-NEXT:    s_mov_b32 s33, s32
303; GFX10-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
304; GFX10-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
305; GFX10-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
306; GFX10-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
307; GFX10-NEXT:    buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
308; GFX10-NEXT:    s_addk_i32 s32, 0x400
309; GFX10-NEXT:    v_writelane_b32 v40, s30, 0
310; GFX10-NEXT:    v_mov_b32_e32 v41, v16
311; GFX10-NEXT:    v_mov_b32_e32 v42, v15
312; GFX10-NEXT:    v_mov_b32_e32 v43, v14
313; GFX10-NEXT:    v_mov_b32_e32 v44, v13
314; GFX10-NEXT:    v_writelane_b32 v40, s31, 1
315; GFX10-NEXT:    v_mov_b32_e32 v45, v12
316; GFX10-NEXT:    v_writelane_b32 v40, s36, 2
317; GFX10-NEXT:    s_mov_b32 s36, 0
318; GFX10-NEXT:    v_writelane_b32 v40, s37, 3
319; GFX10-NEXT:    s_mov_b32 s37, s36
320; GFX10-NEXT:    v_writelane_b32 v40, s38, 4
321; GFX10-NEXT:    s_mov_b32 s38, s36
322; GFX10-NEXT:    v_writelane_b32 v40, s39, 5
323; GFX10-NEXT:    s_mov_b32 s39, s36
324; GFX10-NEXT:    v_writelane_b32 v40, s40, 6
325; GFX10-NEXT:    s_mov_b32 s40, s36
326; GFX10-NEXT:    v_writelane_b32 v40, s41, 7
327; GFX10-NEXT:    s_mov_b32 s41, s36
328; GFX10-NEXT:    v_writelane_b32 v40, s42, 8
329; GFX10-NEXT:    s_mov_b32 s42, s36
330; GFX10-NEXT:    v_writelane_b32 v40, s43, 9
331; GFX10-NEXT:    s_mov_b32 s43, s36
332; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
333; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
334; GFX10-NEXT:    s_getpc_b64 s[4:5]
335; GFX10-NEXT:    s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
336; GFX10-NEXT:    s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
337; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
338; GFX10-NEXT:    s_waitcnt vmcnt(0)
339; GFX10-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
340; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
341; GFX10-NEXT:    s_swappc_b64 s[30:31], s[4:5]
342; GFX10-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
343; GFX10-NEXT:    s_clause 0x4
344; GFX10-NEXT:    buffer_load_dword v45, off, s[0:3], s33
345; GFX10-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:4
346; GFX10-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:8
347; GFX10-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:12
348; GFX10-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16
349; GFX10-NEXT:    v_readlane_b32 s43, v40, 9
350; GFX10-NEXT:    v_readlane_b32 s42, v40, 8
351; GFX10-NEXT:    v_readlane_b32 s41, v40, 7
352; GFX10-NEXT:    v_readlane_b32 s40, v40, 6
353; GFX10-NEXT:    v_readlane_b32 s39, v40, 5
354; GFX10-NEXT:    v_readlane_b32 s38, v40, 4
355; GFX10-NEXT:    v_readlane_b32 s37, v40, 3
356; GFX10-NEXT:    v_readlane_b32 s36, v40, 2
357; GFX10-NEXT:    v_readlane_b32 s31, v40, 1
358; GFX10-NEXT:    v_readlane_b32 s30, v40, 0
359; GFX10-NEXT:    s_addk_i32 s32, 0xfc00
360; GFX10-NEXT:    v_readlane_b32 s33, v40, 10
361; GFX10-NEXT:    s_or_saveexec_b32 s4, -1
362; GFX10-NEXT:    buffer_load_dword v40, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
363; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
364; GFX10-NEXT:    s_mov_b32 exec_lo, s4
365; GFX10-NEXT:    s_waitcnt vmcnt(0)
366; GFX10-NEXT:    s_setpc_b64 s[30:31]
367;
368; GFX11-LABEL: call_preserved_vgpr_tuple8:
369; GFX11:       ; %bb.0: ; %main_body
370; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
372; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
373; GFX11-NEXT:    scratch_store_b32 off, v40, s32 offset:20 ; 4-byte Folded Spill
374; GFX11-NEXT:    s_mov_b32 exec_lo, s0
375; GFX11-NEXT:    v_writelane_b32 v40, s33, 10
376; GFX11-NEXT:    s_mov_b32 s33, s32
377; GFX11-NEXT:    s_clause 0x4
378; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:16
379; GFX11-NEXT:    scratch_store_b32 off, v42, s33 offset:12
380; GFX11-NEXT:    scratch_store_b32 off, v43, s33 offset:8
381; GFX11-NEXT:    scratch_store_b32 off, v44, s33 offset:4
382; GFX11-NEXT:    scratch_store_b32 off, v45, s33
383; GFX11-NEXT:    s_add_i32 s32, s32, 32
384; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
385; GFX11-NEXT:    v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15
386; GFX11-NEXT:    v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13
387; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
388; GFX11-NEXT:    v_mov_b32_e32 v45, v12
389; GFX11-NEXT:    v_writelane_b32 v40, s36, 2
390; GFX11-NEXT:    s_mov_b32 s36, 0
391; GFX11-NEXT:    v_writelane_b32 v40, s37, 3
392; GFX11-NEXT:    s_mov_b32 s37, s36
393; GFX11-NEXT:    v_writelane_b32 v40, s38, 4
394; GFX11-NEXT:    s_mov_b32 s38, s36
395; GFX11-NEXT:    v_writelane_b32 v40, s39, 5
396; GFX11-NEXT:    s_mov_b32 s39, s36
397; GFX11-NEXT:    v_writelane_b32 v40, s40, 6
398; GFX11-NEXT:    s_mov_b32 s40, s36
399; GFX11-NEXT:    v_writelane_b32 v40, s41, 7
400; GFX11-NEXT:    s_mov_b32 s41, s36
401; GFX11-NEXT:    v_writelane_b32 v40, s42, 8
402; GFX11-NEXT:    s_mov_b32 s42, s36
403; GFX11-NEXT:    v_writelane_b32 v40, s43, 9
404; GFX11-NEXT:    s_mov_b32 s43, s36
405; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], v[12:16], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
406; GFX11-NEXT:    s_getpc_b64 s[0:1]
407; GFX11-NEXT:    s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4
408; GFX11-NEXT:    s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12
409; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
410; GFX11-NEXT:    s_waitcnt vmcnt(0)
411; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
412; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
413; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
414; GFX11-NEXT:    image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[36:43], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
415; GFX11-NEXT:    s_clause 0x4
416; GFX11-NEXT:    scratch_load_b32 v45, off, s33
417; GFX11-NEXT:    scratch_load_b32 v44, off, s33 offset:4
418; GFX11-NEXT:    scratch_load_b32 v43, off, s33 offset:8
419; GFX11-NEXT:    scratch_load_b32 v42, off, s33 offset:12
420; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:16
421; GFX11-NEXT:    v_readlane_b32 s43, v40, 9
422; GFX11-NEXT:    v_readlane_b32 s42, v40, 8
423; GFX11-NEXT:    v_readlane_b32 s41, v40, 7
424; GFX11-NEXT:    v_readlane_b32 s40, v40, 6
425; GFX11-NEXT:    v_readlane_b32 s39, v40, 5
426; GFX11-NEXT:    v_readlane_b32 s38, v40, 4
427; GFX11-NEXT:    v_readlane_b32 s37, v40, 3
428; GFX11-NEXT:    v_readlane_b32 s36, v40, 2
429; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
430; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
431; GFX11-NEXT:    s_addk_i32 s32, 0xffe0
432; GFX11-NEXT:    v_readlane_b32 s33, v40, 10
433; GFX11-NEXT:    s_or_saveexec_b32 s0, -1
434; GFX11-NEXT:    scratch_load_b32 v40, off, s32 offset:20 ; 4-byte Folded Reload
435; GFX11-NEXT:    s_mov_b32 exec_lo, s0
436; GFX11-NEXT:    s_waitcnt vmcnt(0)
437; GFX11-NEXT:    s_setpc_b64 s[30:31]
438
439
440
441
442
443
444
445main_body:
  ; The first gather4 result is stored to memory before the call, so it does not
  ; itself need to survive the call.
446  %v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
447  store <4 x float> %v, <4 x float> addrspace(1)* undef
448  call void @extern_func()
  ; The second gather4 reuses %bias, %zcompare, %s, %t and %clamp after the
  ; call, so those five address values are live across it.
449  %v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0)
450  ret <4 x float> %v1
451}
452
; Declaration of the image gather4 intrinsic called by both test functions.
453declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
454
; Attribute groups: #0 is attached to the inline-asm clobber calls, #1 to the
; intrinsic declaration, and #2 to the @extern_func declaration.
455attributes #0 = { nounwind writeonly }
456attributes #1 = { nounwind readonly }
457attributes #2 = { "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
458