;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI

; Check that WQM isn't triggered by image load/store intrinsics.
;
;CHECK-LABEL: {{^}}test1:
;CHECK-NOT: s_wqm
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
main_body:
  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
  ret <4 x float> %tex
}

; Check that WQM is triggered by image samples and left untouched for loads...
;
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK-NOT: exec
define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
main_body:
  %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
  %c.3 = extractelement <4 x i32> %c.2, i32 0
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
  %data = load float, float addrspace(1)* %gep

  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef)

  ret void
}

; ... but disabled for stores (and, in this simple case, not re-enabled).
;
;CHECK-LABEL: {{^}}test3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
;CHECK-NOT: exec
;CHECK: .size test3
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %tex.2 = extractelement <4 x i32> %tex.1, i32 0

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i1 0, i1 0)

  ret <4 x float> %tex
}

; Check that WQM is re-enabled when required.
;
;CHECK-LABEL: {{^}}test4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
main_body:
  %c.1 = mul i32 %c, %d

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)

  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %dtex
}

; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
;
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
;
;CHECK-LABEL: {{^}}test_control_flow_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %ELSE
;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVED]]
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %IF, label %ELSE

IF:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Reverse branch order compared to the previous test.
;
;CHECK-LABEL: {{^}}test_control_flow_1:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: image_sample
;CHECK: image_sample
;CHECK: %Flow
;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]],
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
;CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %ELSE
;CHECK: store_dword
;CHECK: [[END_BB]]: ; %END
;CHECK: s_or_b64 exec, exec,
;CHECK: v_mov_b32_e32 v0
;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
main_body:
  %cmp = icmp eq i32 %z, 0
  br i1 %cmp, label %ELSE, label %IF

IF:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %data.if = extractelement <4 x float> %dtex, i32 0
  br label %END

ELSE:
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i1 0, i1 0)
  br label %END

END:
  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
  ret float %r
}

; Check that branch conditions are properly marked as needing WQM...
;
;CHECK-LABEL: {{^}}test_control_flow_2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: load
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: store
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmp
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
main_body:
  %idx.1 = extractelement <3 x i32> %idx, i32 0
  %data.1 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)

  ; The load that determines the branch (and should therefore be WQM) is
  ; surrounded by stores that require disabled WQM.
  %idx.2 = extractelement <3 x i32> %idx, i32 1
  %z = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i1 0, i1 0)

  %idx.3 = extractelement <3 x i32> %idx, i32 2
  %data.3 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i1 0, i1 0)

  %cc = fcmp ogt float %z, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %coord.IF = mul i32 %coord, 3
  br label %END

ELSE:
  %coord.ELSE = mul i32 %coord, 4
  br label %END

END:
  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %tex
}

; ... but only if they really do need it.
;
;CHECK-LABEL: {{^}}test_control_flow_3:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: v_cmp
;CHECK: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %dtex.1 = extractelement <4 x float> %dtex, i32 0

  call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)

  %cc = fcmp ogt float %dtex.1, 0.0
  br i1 %cc, label %IF, label %ELSE

IF:
  %tex.IF = fmul float %dtex.1, 3.0
  br label %END

ELSE:
  %tex.ELSE = fmul float %dtex.1, 4.0
  br label %END

END:
  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
  ret float %tex.END
}

; Another test that failed at some point because of terminator handling.
;
;CHECK-LABEL: {{^}}test_control_flow_4:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: %IF
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]],  [[ORIG]]
;CHECK: load
;CHECK: store
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: %END
;CHECK: image_sample
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
main_body:
  %cond = icmp eq i32 %y, 0
  br i1 %cond, label %IF, label %END

IF:
  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)
  br label %END

END:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %dtex
}

; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
;CHECK-LABEL: {{^}}test_kill_0:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: buffer_store_dword
;CHECK: s_wqm_b64 exec, exec
;CHECK: v_cmpx_
;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
;CHECK: buffer_store_dword
;CHECK: s_mov_b64 exec, [[SAVE]]
;CHECK: image_sample
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  %idx.0 = extractelement <2 x i32> %idx, i32 0
  %data.0 = extractelement <2 x float> %data, i32 0
  call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)

  call void @llvm.AMDGPU.kill(float %z)

  %idx.1 = extractelement <2 x i32> %idx, i32 1
  %data.1 = extractelement <2 x float> %data, i32 1
  call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)

  %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %out = fadd <4 x float> %tex, %dtex

  ret <4 x float> %out
}

; ... but only if WQM is necessary.
;
; CHECK-LABEL: {{^}}test_kill_1:
; CHECK-NEXT: ; %main_body
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: image_sample
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: buffer_store_dword
; CHECK-NOT: wqm
; CHECK: v_cmpx_
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
main_body:
  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  call void @llvm.AMDGPU.kill(float %z)

  ret <4 x float> %dtex
}

; Check prolog shaders.
;
; CHECK-LABEL: {{^}}test_prolog_1:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: v_add_f32_e32 v0,
; CHECK: s_and_b64 exec, exec, [[ORIG]]
define amdgpu_ps float @test_prolog_1(float %a, float %b) #4 {
main_body:
  %s = fadd float %a, %b
  ret float %s
}

; CHECK-LABEL: {{^}}test_loop_vcc:
; CHECK-NEXT: ; %entry
; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_store
; CHECK: s_wqm_b64 exec, exec
; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
; CHECK-DAG: v_mov_b32_e32 [[SEVEN:v[0-9]+]], 0x40e00000

; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
; CHECK: s_cbranch_vccz [[LOOPHDR]]
; CHECK: ; %break

; CHECK: ; return
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
entry:
  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
  br label %loop

loop:
  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
  %cc = fcmp ogt float %ctr.iv, 7.0
  br i1 %cc, label %break, label %body

body:
  %c.i = bitcast <4 x float> %c.iv to <4 x i32>
  %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %ctr.next = fadd float %ctr.iv, 2.0
  br label %loop

break:
  ret <4 x float> %c.iv
}

; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
;
; CHECK-LABEL: {{^}}test_alloca:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+$}}
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
; CHECK: s_wqm_b64 exec, exec
; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen

; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK: image_sample
; CHECK: buffer_store_dwordx4
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
entry:
  %array = alloca [32 x i32], align 4

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  %s.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 0
  store volatile i32 %a, i32* %s.gep, align 4

  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i1 0, i1 0)

  %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
  %c = load i32, i32* %c.gep, align 4

  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)

  ret void
}

; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
;
; CHECK-LABEL: {{^}}test_nonvoid_return:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.i = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  ret <4 x float> %dtex
}

; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tex.i = bitcast <4 x float> %tex to <4 x i32>
  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

  %cc = icmp sgt i32 %c, 0
  br i1 %cc, label %if, label %else

if:
  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
  unreachable

else:
  ret <4 x float> %dtex
}

; Test awareness that s_wqm_b64 clobbers SCC.
;
; CHECK-LABEL: {{^}}test_scc:
; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
; CHECK: s_cmp_
; CHECK-NEXT: s_cbranch_scc
; CHECK: ; %if
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: ; %else
; CHECK: s_and_b64 exec, exec, [[ORIG]]
; CHECK: image_sample
; CHECK: ; %end
define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
main_body:
  %cc = icmp sgt i32 %sel, 0
  br i1 %cc, label %if, label %else

if:
  %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  br label %end

else:
  %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 1>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  br label %end

end:
  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]

  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)

  ret <4 x float> %r
}


declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1

declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2

declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3

declare void @llvm.AMDGPU.kill(float)
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { "amdgpu-ps-wqm-outputs" }
