1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s
4
5; Check that WQM isn't triggered by image load/store intrinsics.
6define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
7; GFX9-W64-LABEL: test1:
8; GFX9-W64:       ; %bb.0: ; %main_body
9; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
10; GFX9-W64-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm
11; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
12; GFX9-W64-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm
13; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
14; GFX9-W64-NEXT:    ; return to shader part epilog
15;
16; GFX10-W32-LABEL: test1:
17; GFX10-W32:       ; %bb.0: ; %main_body
18; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
19; GFX10-W32-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
20; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
21; GFX10-W32-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
22; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
23; GFX10-W32-NEXT:    ; return to shader part epilog
24main_body:
  ; Load a four-channel texel at coordinate %c, write the same value back to the
  ; same coordinate, and return it. Only image load/store intrinsics are used
  ; here (no sample, wqm or wwm call), and the expected output above contains
  ; no s_wqm instruction.
25  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
26  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
27  ret <4 x float> %tex
28}
29
30; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
31define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
32; GFX9-W64-LABEL: test2:
33; GFX9-W64:       ; %bb.0: ; %main_body
34; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
35; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
36; GFX9-W64-NEXT:    s_mov_b32 m0, s3
37; GFX9-W64-NEXT:    s_nop 0
38; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
39; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
40; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
41; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
42; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
43; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
44; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
45; GFX9-W64-NEXT:    ; return to shader part epilog
46;
47; GFX10-W32-LABEL: test2:
48; GFX10-W32:       ; %bb.0: ; %main_body
49; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
50; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
51; GFX10-W32-NEXT:    s_mov_b32 m0, s3
52; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
53; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
54; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
55; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
56; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
57; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
58; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
59; GFX10-W32-NEXT:    ; return to shader part epilog
60main_body:
  ; Element 0 of %pos feeds both interp.p1 calls and element 1 feeds both
  ; interp.p2 calls. The two interpolated values (attr0.x and attr0.y in the
  ; expected output) are the coordinates of the 2D sample whose result is
  ; returned. The first three inreg i32 arguments are unnamed and unused.
61  %inst23 = extractelement <2 x float> %pos, i32 0
62  %inst24 = extractelement <2 x float> %pos, i32 1
63  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
64  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
65  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
66  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
67  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
68  ret <4 x float> %tex
69}
70
71; ... but disabled for stores (and, in this simple case, not re-enabled) ...
72define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
73; GFX9-W64-LABEL: test3:
74; GFX9-W64:       ; %bb.0: ; %main_body
75; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
76; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
77; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
78; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
79; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
80; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
81; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
82; GFX9-W64-NEXT:    ; return to shader part epilog
83;
84; GFX10-W32-LABEL: test3:
85; GFX10-W32:       ; %bb.0: ; %main_body
86; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
87; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
88; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
89; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
90; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
91; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
92; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
93; GFX10-W32-NEXT:    ; return to shader part epilog
94main_body:
  ; Sample a 1D image at %c, then store the full sampled vector with
  ; struct.buffer.store. The store index is channel 0 of the sample result
  ; reinterpreted as i32; the buffer resource operand is undef.
95  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
96  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
97  %tex.2 = extractelement <4 x i32> %tex.1, i32 0
98
99  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
100
101  ret <4 x float> %tex
102}
103
104; ... and disabled for export.
105define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
106; GFX9-W64-LABEL: test3x:
107; GFX9-W64:       ; %bb.0: ; %main_body
108; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
109; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
110; GFX9-W64-NEXT:    s_mov_b32 m0, s3
111; GFX9-W64-NEXT:    s_nop 0
112; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
113; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
114; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
115; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
116; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
117; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
118; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
119; GFX9-W64-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
120; GFX9-W64-NEXT:    s_endpgm
121;
122; GFX10-W32-LABEL: test3x:
123; GFX10-W32:       ; %bb.0: ; %main_body
124; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
125; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
126; GFX10-W32-NEXT:    s_mov_b32 m0, s3
127; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
128; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
129; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
130; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
131; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
132; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
133; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
134; GFX10-W32-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
135; GFX10-W32-NEXT:    s_endpgm
136main_body:
  ; Interpolate two coordinates from %pos (element 0 feeds interp.p1, element 1
  ; feeds interp.p2), sample a 2D image with them, and pass the four result
  ; channels to exp.f32. The function returns void; the sample result is only
  ; consumed by the export.
137  %inst23 = extractelement <2 x float> %pos, i32 0
138  %inst24 = extractelement <2 x float> %pos, i32 1
139  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
140  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
141  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
142  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
143  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
144  %tex.0 = extractelement <4 x float> %tex, i32 0
145  %tex.1 = extractelement <4 x float> %tex, i32 1
146  %tex.2 = extractelement <4 x float> %tex, i32 2
147  %tex.3 = extractelement <4 x float> %tex, i32 3
148  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
149  ret void
150}
151
152; Check that WQM is re-enabled when required.
153define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
154; GFX9-W64-LABEL: test4:
155; GFX9-W64:       ; %bb.0: ; %main_body
156; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
157; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
158; GFX9-W64-NEXT:    v_mul_lo_u32 v4, v0, v1
159; GFX9-W64-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
160; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
161; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
162; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
163; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
164; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
165; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
166; GFX9-W64-NEXT:    ; return to shader part epilog
167;
168; GFX10-W32-LABEL: test4:
169; GFX10-W32:       ; %bb.0: ; %main_body
170; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
171; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
172; GFX10-W32-NEXT:    v_mul_lo_u32 v4, v0, v1
173; GFX10-W32-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
174; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
175; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
176; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
177; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
178; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
179; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
180; GFX10-W32-NEXT:    ; return to shader part epilog
181main_body:
  ; %c.1 = %c * %d is used twice: as the index of a buffer store (undef data and
  ; undef resource) and, bitcast to float, as the coordinate of the first
  ; sample. Channel 0 of the first sample is the coordinate of the second
  ; sample, whose result is returned. %ptr and %data are not referenced.
182  %c.1 = mul i32 %c, %d
183
184  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
185  %c.1.bc = bitcast i32 %c.1 to float
186  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
187  %tex0 = extractelement <4 x float> %tex, i32 0
188  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
189  ret <4 x float> %dtex
190}
191
192; Check that WQM is triggered by the wqm intrinsic.
193; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
194; does not happen - the v_add should write the return reg directly.
195define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
196; GFX9-W64-LABEL: test5:
197; GFX9-W64:       ; %bb.0: ; %main_body
198; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
199; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
200; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
201; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
202; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
203; GFX9-W64-NEXT:    s_nop 0
204; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
205; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
206; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
207; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
208; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
209; GFX9-W64-NEXT:    ; return to shader part epilog
210;
211; GFX10-W32-LABEL: test5:
212; GFX10-W32:       ; %bb.0: ; %main_body
213; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
214; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
215; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
216; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
217; GFX10-W32-NEXT:    s_clause 0x1
218; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
219; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
220; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
221; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
222; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
223; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
224; GFX10-W32-NEXT:    ; return to shader part epilog
225main_body:
  ; Two buffer loads (indices %idx0 and %idx1, undef resource) are added and the
  ; sum is passed through llvm.amdgcn.wqm.f32 before being returned. In the
  ; expected output the v_add writes v0 directly, with no following v_mov.
226  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
227  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
228  %out = fadd float %src0, %src1
229  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
230  ret float %out.0
231}
232
233; Check that the wqm intrinsic works correctly for integers.
234define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
235; GFX9-W64-LABEL: test6:
236; GFX9-W64:       ; %bb.0: ; %main_body
237; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
238; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
239; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
240; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
241; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
242; GFX9-W64-NEXT:    s_nop 0
243; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
244; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
245; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
246; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
247; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
248; GFX9-W64-NEXT:    ; return to shader part epilog
249;
250; GFX10-W32-LABEL: test6:
251; GFX10-W32:       ; %bb.0: ; %main_body
252; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
253; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
254; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
255; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
256; GFX10-W32-NEXT:    s_clause 0x1
257; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
258; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
259; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
260; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
261; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
262; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
263; GFX10-W32-NEXT:    ; return to shader part epilog
264main_body:
  ; Same computation as the float variant: two buffer loads and a float add.
  ; The sum is bitcast to i32 so it can go through llvm.amdgcn.wqm.i32, then
  ; bitcast back to float for the return value.
265  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
266  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
267  %out = fadd float %src0, %src1
268  %out.0 = bitcast float %out to i32
269  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
270  %out.2 = bitcast i32 %out.1 to float
271  ret float %out.2
272}
273
274; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
275
276; Check that WWM is triggered by the wwm intrinsic.
277define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
278; GFX9-W64-LABEL: test_wwm1:
279; GFX9-W64:       ; %bb.0: ; %main_body
280; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
281; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
282; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
283; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
284; GFX9-W64-NEXT:    s_nop 0
285; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
286; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
287; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
288; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
289; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
290; GFX9-W64-NEXT:    ; return to shader part epilog
291;
292; GFX10-W32-LABEL: test_wwm1:
293; GFX10-W32:       ; %bb.0: ; %main_body
294; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
295; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
296; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
297; GFX10-W32-NEXT:    s_clause 0x1
298; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
299; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
300; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
301; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
302; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
303; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
304; GFX10-W32-NEXT:    ; return to shader part epilog
305main_body:
  ; Two buffer loads are added and the sum is passed through
  ; llvm.amdgcn.wwm.f32. In the expected output the loads and the add sit
  ; between s_or_saveexec ..., -1 and the exec restore; the copy into v0
  ; follows the restore.
306  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
307  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
308  %out = fadd float %src0, %src1
309  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
310  ret float %out.0
311}
312
313; Same as above, but with an integer type.
314define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
315; GFX9-W64-LABEL: test_wwm2:
316; GFX9-W64:       ; %bb.0: ; %main_body
317; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
318; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
319; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
320; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
321; GFX9-W64-NEXT:    s_nop 0
322; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
323; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
324; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
325; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
326; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
327; GFX9-W64-NEXT:    ; return to shader part epilog
328;
329; GFX10-W32-LABEL: test_wwm2:
330; GFX10-W32:       ; %bb.0: ; %main_body
331; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
332; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
333; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
334; GFX10-W32-NEXT:    s_clause 0x1
335; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
336; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
337; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
338; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
339; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
340; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
341; GFX10-W32-NEXT:    ; return to shader part epilog
342main_body:
  ; Both loaded floats are bitcast to i32 and added as integers; the integer sum
  ; goes through llvm.amdgcn.wwm.i32 and is bitcast back to float for the
  ; return value.
343  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
344  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
345  %src0.0 = bitcast float %src0 to i32
346  %src1.0 = bitcast float %src1 to i32
347  %out = add i32 %src0.0, %src1.0
348  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
349  %out.1 = bitcast i32 %out.0 to float
350  ret float %out.1
351}
352
353; Check that we don't leave WWM on for computations that don't require WWM,
354; since that will lead to clobbering things that aren't supposed to be clobbered
355; in cases like this.
356; We enforce this by checking that v_add gets emitted in the same block as
357; WWM computations.
358define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
359; GFX9-W64-LABEL: test_wwm3:
360; GFX9-W64:       ; %bb.0: ; %main_body
361; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
362; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
363; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
364; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
365; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
366; GFX9-W64-NEXT:    s_cbranch_execz .LBB9_2
367; GFX9-W64-NEXT:  ; %bb.1: ; %if
368; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
369; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
370; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
371; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
372; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
373; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
374; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
375; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
376; GFX9-W64-NEXT:  .LBB9_2: ; %endif
377; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
378; GFX9-W64-NEXT:    ; return to shader part epilog
379;
380; GFX10-W32-LABEL: test_wwm3:
381; GFX10-W32:       ; %bb.0: ; %main_body
382; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
383; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
384; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
385; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
386; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
387; GFX10-W32-NEXT:    s_cbranch_execz .LBB9_2
388; GFX10-W32-NEXT:  ; %bb.1: ; %if
389; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
390; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
391; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
392; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
393; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
394; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
395; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
396; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
397; GFX10-W32-NEXT:  .LBB9_2: ; %endif
398; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
399; GFX10-W32-NEXT:    ; return to shader part epilog
400main_body:
401  ; use mbcnt to make sure the branch is divergent
402  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
403  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
404  %cc = icmp uge i32 %hi, 32
405  br i1 %cc, label %endif, label %if
406
407if:
  ; %out (src + src) is the value passed to wwm; %out.1 then adds the original
  ; load to the wwm result. In the expected output that second add is emitted
  ; after exec has been restored from the s_or_saveexec.
408  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
409  %out = fadd float %src, %src
410  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
411  %out.1 = fadd float %src, %out.0
412  br label %endif
413
414endif:
  ; Lanes that skipped %if return 0.0.
415  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
416  ret float %out.2
417}
418
419; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
420; write could clobber disabled channels in the non-WWM one.
421; We enforce this by checking that v_mov gets emitted in the same block as
422; WWM computations.
423define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
424; GFX9-W64-LABEL: test_wwm4:
425; GFX9-W64:       ; %bb.0: ; %main_body
426; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
427; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
428; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
429; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
430; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
431; GFX9-W64-NEXT:    s_cbranch_execz .LBB10_2
432; GFX9-W64-NEXT:  ; %bb.1: ; %if
433; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
434; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
435; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
436; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
437; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
438; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
439; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
440; GFX9-W64-NEXT:  .LBB10_2: ; %endif
441; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
442; GFX9-W64-NEXT:    ; return to shader part epilog
443;
444; GFX10-W32-LABEL: test_wwm4:
445; GFX10-W32:       ; %bb.0: ; %main_body
446; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
447; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
448; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
449; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
450; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
451; GFX10-W32-NEXT:    s_cbranch_execz .LBB10_2
452; GFX10-W32-NEXT:  ; %bb.1: ; %if
453; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
454; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
455; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
456; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
457; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
458; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
459; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
460; GFX10-W32-NEXT:  .LBB10_2: ; %endif
461; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
462; GFX10-W32-NEXT:    ; return to shader part epilog
463main_body:
464  ; use mbcnt to make sure the branch is divergent
465  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
466  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
467  %cc = icmp uge i32 %hi, 32
468  br i1 %cc, label %endif, label %if
469
470if:
  ; The wwm result has no further use in this block; it flows straight into the
  ; phi in %endif. The expected output keeps the computation in v1 and copies
  ; it into v0 only after exec has been restored.
471  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
472  %out = fadd float %src, %src
473  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
474  br label %endif
475
476endif:
  ; Lanes that skipped %if return 0.0.
477  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
478  ret float %out.1
479}
480
481; Make sure the transition from Exact to WWM then WQM works properly.
482define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
483; GFX9-W64-LABEL: test_wwm5:
484; GFX9-W64:       ; %bb.0: ; %main_body
485; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
486; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
487; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
488; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
489; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
490; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
491; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
492; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
493; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
494; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
495; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
496; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
497; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
498; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
499; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
500; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
501; GFX9-W64-NEXT:    ; return to shader part epilog
502;
503; GFX10-W32-LABEL: test_wwm5:
504; GFX10-W32:       ; %bb.0: ; %main_body
505; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
506; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
507; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
508; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
509; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
510; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
511; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
512; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
513; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
514; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
515; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
516; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
517; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
518; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
519; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
520; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
521; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
522; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
523; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
524; GFX10-W32-NEXT:    ; return to shader part epilog
525main_body:
  ; Three phases in program order: (1) load from %idx0 and store the value back
  ; to the same index; (2) load from %idx1, double it, and pass it through wwm;
  ; (3) double the wwm result and pass it through wqm, which is returned.
526  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
527  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
528  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
529  %temp = fadd float %src1, %src1
530  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
531  %out = fadd float %temp.0, %temp.0
532  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
533  ret float %out.0
534}
535
536; Check that WWM is turned on correctly across basic block boundaries.
537; if..then..endif version
538;SI-CHECK: buffer_load_dword
539;VI-CHECK: flat_load_dword
540;SI-CHECK: buffer_load_dword
541;VI-CHECK: flat_load_dword
542define amdgpu_ps float @test_wwm6_then() {
543; GFX9-W64-LABEL: test_wwm6_then:
544; GFX9-W64:       ; %bb.0: ; %main_body
545; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
546; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
547; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
548; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
549; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
550; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
551; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
552; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
553; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
554; GFX9-W64-NEXT:    s_cbranch_execz .LBB12_2
555; GFX9-W64-NEXT:  ; %bb.1: ; %if
556; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
557; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
558; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
559; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
560; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
561; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
562; GFX9-W64-NEXT:  .LBB12_2: ; %endif
563; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
564; GFX9-W64-NEXT:    ; return to shader part epilog
565;
566; GFX10-W32-LABEL: test_wwm6_then:
567; GFX10-W32:       ; %bb.0: ; %main_body
568; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
569; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
570; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
571; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
572; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
573; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
574; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
575; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
576; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
577; GFX10-W32-NEXT:    s_cbranch_execz .LBB12_2
578; GFX10-W32-NEXT:  ; %bb.1: ; %if
579; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
580; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
581; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
582; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
583; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
584; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
585; GFX10-W32-NEXT:  .LBB12_2: ; %endif
586; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
587; GFX10-W32-NEXT:    ; return to shader part epilog
588main_body:
  ; %src0 is loaded here but only consumed in %if, where it is one operand of
  ; the add whose result goes through wwm. In the expected output this load is
  ; wrapped in its own s_or_saveexec ..., -1 / exec restore pair in %bb.0.
589  %src0 = load volatile float, float addrspace(1)* undef
590  ; use mbcnt to make sure the branch is divergent
591  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
592  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
593  %cc = icmp uge i32 %hi, 32
594  br i1 %cc, label %endif, label %if
595
596if:
  ; The second operand is loaded in this block; the sum of both loads is the
  ; value passed to wwm.
597  %src1 = load volatile float, float addrspace(1)* undef
598  %out = fadd float %src0, %src1
599  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
600  br label %endif
601
602endif:
  ; Lanes that skipped %if return 0.0.
603  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
604  ret float %out.1
605}
606
607; Check that WWM is turned on correctly across basic block boundaries.
608; loop version
609;SI-CHECK: buffer_load_dword
610;VI-CHECK: flat_load_dword
611;SI-CHECK: buffer_load_dword
612;VI-CHECK: flat_load_dword
613define amdgpu_ps float @test_wwm6_loop() {
614; GFX9-W64-LABEL: test_wwm6_loop:
615; GFX9-W64:       ; %bb.0: ; %main_body
616; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
617; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
618; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
619; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
620; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
621; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
622; GFX9-W64-NEXT:  .LBB13_1: ; %loop
623; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
624; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
625; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
626; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
627; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
628; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
629; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
630; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
631; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
632; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
633; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
634; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
635; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
636; GFX9-W64-NEXT:    s_cbranch_execnz .LBB13_1
637; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
638; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
639; GFX9-W64-NEXT:    ; return to shader part epilog
640;
641; GFX10-W32-LABEL: test_wwm6_loop:
642; GFX10-W32:       ; %bb.0: ; %main_body
643; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
644; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
645; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
646; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
647; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
648; GFX10-W32-NEXT:    s_mov_b32 s0, 0
649; GFX10-W32-NEXT:  .LBB13_1: ; %loop
650; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
651; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
652; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
653; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
654; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
655; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
656; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
657; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
658; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
659; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
660; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
661; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
662; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
663; GFX10-W32-NEXT:    s_cbranch_execnz .LBB13_1
664; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
665; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
666; GFX10-W32-NEXT:    ; return to shader part epilog
667main_body:
  ; %src0 is loaded once before the loop and reused as an add operand on every
  ; iteration.
668  %src0 = load volatile float, float addrspace(1)* undef
669  ; use mbcnt to make sure the branch is divergent
670  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  ; NOTE(review): %hi is computed but never used below; the loop counter is
  ; seeded from %lo, and only v_mbcnt_lo appears in the expected output.
671  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
672  br label %loop
673
674loop:
  ; Each iteration performs a fresh volatile load, adds it to %src0, passes the
  ; sum through wwm, and decrements the counter; the loop exits when the
  ; counter reaches zero.
675  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
676  %src1 = load volatile float, float addrspace(1)* undef
677  %out = fadd float %src0, %src1
678  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
679  %counter.1 = sub i32 %counter, 1
680  %cc = icmp ne i32 %counter.1, 0
681  br i1 %cc, label %loop, label %endloop
682
683endloop:
  ; Returns the wwm result from the final loop iteration.
684  ret float %out.0
685}
686
687; Check that @llvm.amdgcn.set.inactive disables WWM.
688define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
689; GFX9-W64-LABEL: test_wwm_set_inactive1:
690; GFX9-W64:       ; %bb.0: ; %main_body
691; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
692; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
693; GFX9-W64-NEXT:    s_not_b64 exec, exec
694; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
695; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
696; GFX9-W64-NEXT:    s_not_b64 exec, exec
697; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
698; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
699; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
700; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
701; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
702; GFX9-W64-NEXT:    s_endpgm
703;
704; GFX10-W32-LABEL: test_wwm_set_inactive1:
705; GFX10-W32:       ; %bb.0: ; %main_body
706; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
707; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
708; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
709; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
710; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
711; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
712; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
713; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
714; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
715; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
716; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
717; GFX10-W32-NEXT:    s_endpgm
718main_body:
719  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0) ; checks expect this load before (outside) the s_or_saveexec -1 region
720  %src.0 = bitcast float %src to i32
721  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0) ; second operand (0) is written while exec is inverted: checks expect a v_mov of 0 between two s_not of exec
722  %out = add i32 %src.1, %src.1 ; input to the wwm call below; checks expect this add inside the s_or_saveexec -1 region
723  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
724  %out.1 = bitcast i32 %out.0 to float ; the store intrinsic takes a float, so cast the i32 result back
725  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
726  ret void
727}
728
729; Check that Strict WQM is triggered by the strict_wqm intrinsic.
730define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
731; GFX9-W64-LABEL: test_strict_wqm1:
732; GFX9-W64:       ; %bb.0: ; %main_body
733; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
734; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
735; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
736; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
737; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
738; GFX9-W64-NEXT:    s_nop 0
739; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
740; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
741; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
742; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
743; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
744; GFX9-W64-NEXT:    ; return to shader part epilog
745;
746; GFX10-W32-LABEL: test_strict_wqm1:
747; GFX10-W32:       ; %bb.0: ; %main_body
748; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
749; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
750; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
751; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
752; GFX10-W32-NEXT:    s_clause 0x1
753; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
754; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
755; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
756; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
757; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
758; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
759; GFX10-W32-NEXT:    ; return to shader part epilog
760main_body:
761  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
762  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
763  %out = fadd float %src0, %src1 ; value handed to strict.wqm below
764  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) ; checks expect both loads and the fadd between s_wqm and the restore of the saved exec
765  ret float %out.0 ; checks expect the copy into the return register v0 after exec is restored
766}
767
768; Same as above, but with an integer type.
769define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
770; GFX9-W64-LABEL: test_strict_wqm2:
771; GFX9-W64:       ; %bb.0: ; %main_body
772; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
773; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
774; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
775; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
776; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
777; GFX9-W64-NEXT:    s_nop 0
778; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
779; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
780; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
781; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
782; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
783; GFX9-W64-NEXT:    ; return to shader part epilog
784;
785; GFX10-W32-LABEL: test_strict_wqm2:
786; GFX10-W32:       ; %bb.0: ; %main_body
787; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
788; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
789; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
790; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
791; GFX10-W32-NEXT:    s_clause 0x1
792; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
793; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
794; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
795; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
796; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
797; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
798; GFX10-W32-NEXT:    ; return to shader part epilog
799main_body:
800  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
801  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
802  %src0.0 = bitcast float %src0 to i32 ; the loads are float-typed; reinterpret the bits so the add below is an integer add
803  %src1.0 = bitcast float %src1 to i32
804  %out = add i32 %src0.0, %src1.0 ; checks expect an integer v_add here, inside the s_wqm region
805  %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out) ; i32 overload of the intrinsic exercised by test_strict_wqm1
806  %out.1 = bitcast i32 %out.0 to float ; the function returns float, so cast the i32 result back
807  ret float %out.1
808}
809
810; Check that we don't leave Strict WQM on for computations that don't require it,
811; since that will lead to clobbering things that aren't supposed to be clobbered
812; in cases like this.
813; We enforce this by checking that v_add gets emitted in the same block as
814; Strict WQM computations.
815define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
816; GFX9-W64-LABEL: test_strict_wqm3:
817; GFX9-W64:       ; %bb.0: ; %main_body
818; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
819; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
820; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
821; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
822; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
823; GFX9-W64-NEXT:    s_cbranch_execz .LBB17_2
824; GFX9-W64-NEXT:  ; %bb.1: ; %if
825; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
826; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
827; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
828; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
829; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
830; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
831; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
832; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
833; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
834; GFX9-W64-NEXT:  .LBB17_2: ; %endif
835; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
836; GFX9-W64-NEXT:    ; return to shader part epilog
837;
838; GFX10-W32-LABEL: test_strict_wqm3:
839; GFX10-W32:       ; %bb.0: ; %main_body
840; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
841; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
842; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
843; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
844; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
845; GFX10-W32-NEXT:    s_cbranch_execz .LBB17_2
846; GFX10-W32-NEXT:  ; %bb.1: ; %if
847; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
848; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
849; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
850; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
851; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
852; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
853; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
854; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
855; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
856; GFX10-W32-NEXT:  .LBB17_2: ; %endif
857; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
858; GFX10-W32-NEXT:    ; return to shader part epilog
859main_body:
860  ; use mbcnt to make sure the branch is divergent
861  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
862  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
863  %cc = icmp uge i32 %hi, 32
864  br i1 %cc, label %endif, label %if ; %hi >= 32 bypasses %if and takes the 0.0 phi input
865
866if:
867  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
868  %out = fadd float %src, %src ; input to strict.wqm; checks expect this add between s_wqm and the exec restore
869  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
870  %out.1 = fadd float %src, %out.0 ; uses the strict.wqm result but is not wrapped itself; checks expect this second v_add after exec is restored, still in %if
871  br label %endif
872
873endif:
874  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
875  ret float %out.2
876}
877
878; Check that Strict WQM writes aren't coalesced with non-strict writes, since
879; the Strict WQM write could clobber disabled channels in the non-strict one.
880; We enforce this by checking that v_mov gets emitted in the same block as
881; Strict WQM computations.
882define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
883; GFX9-W64-LABEL: test_strict_wqm4:
884; GFX9-W64:       ; %bb.0: ; %main_body
885; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
886; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
887; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
888; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
889; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
890; GFX9-W64-NEXT:    s_cbranch_execz .LBB18_2
891; GFX9-W64-NEXT:  ; %bb.1: ; %if
892; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
893; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
894; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
895; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
896; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
897; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
898; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
899; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
900; GFX9-W64-NEXT:  .LBB18_2: ; %endif
901; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
902; GFX9-W64-NEXT:    ; return to shader part epilog
903;
904; GFX10-W32-LABEL: test_strict_wqm4:
905; GFX10-W32:       ; %bb.0: ; %main_body
906; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
907; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
908; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
909; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
910; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
911; GFX10-W32-NEXT:    s_cbranch_execz .LBB18_2
912; GFX10-W32-NEXT:  ; %bb.1: ; %if
913; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
914; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
915; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
916; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
917; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
918; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
919; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
920; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
921; GFX10-W32-NEXT:  .LBB18_2: ; %endif
922; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
923; GFX10-W32-NEXT:    ; return to shader part epilog
924main_body:
925  ; use mbcnt to make sure the branch is divergent
926  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
927  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
928  %cc = icmp uge i32 %hi, 32
929  br i1 %cc, label %endif, label %if ; %hi >= 32 bypasses %if and takes the 0.0 phi input
930
931if:
932  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
933  %out = fadd float %src, %src ; input to strict.wqm; checks expect this add between s_wqm and the exec restore
934  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out) ; unlike test_strict_wqm3, the result goes straight to the phi with no further arithmetic
935  br label %endif
936
937endif:
938  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ] ; merges the strict.wqm result with 0.0; checks expect a separate v_mov into v0 after exec is restored rather than the add writing v0 directly
939  ret float %out.1
940}
941
942; Make sure the transition from Exact to Strict WQM then WQM works properly.
943define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
944; GFX9-W64-LABEL: test_strict_wqm5:
945; GFX9-W64:       ; %bb.0: ; %main_body
946; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
947; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
948; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
949; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
950; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
951; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
952; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
953; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
954; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
955; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
956; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
957; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
958; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
959; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
960; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
961; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
962; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
963; GFX9-W64-NEXT:    ; return to shader part epilog
964;
965; GFX10-W32-LABEL: test_strict_wqm5:
966; GFX10-W32:       ; %bb.0: ; %main_body
967; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
968; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
969; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
970; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
971; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
972; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
973; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
974; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
975; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
976; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
977; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
978; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
979; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
980; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
981; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
982; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
983; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
984; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
985; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
986; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
987; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
988; GFX10-W32-NEXT:    ; return to shader part epilog
989main_body:
990  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
991  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0) ; load+store pair ahead of the strict.wqm chain (presumably the "Exact" phase named in the test description)
992  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
993  %temp = fadd float %src1, %src1 ; input to strict.wqm; checks expect the %src1 load and this add in an s_wqm region that ends by restoring a saved exec
994  %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
995  %out = fadd float %temp.0, %temp.0 ; input to plain wqm; checks expect a second s_wqm right after the exec restore, before this add
996  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out) ; checks expect exec to be ANDed with the mask saved at entry before returning
997  ret float %out.0
998}
999
1000; Check that Strict WQM is turned on correctly across basic block boundaries.
1001; if..then..endif version
1002;SI-CHECK: buffer_load_dword
1003;VI-CHECK: flat_load_dword
1004;SI-CHECK: buffer_load_dword
1005;VI-CHECK: flat_load_dword
1006define amdgpu_ps float @test_strict_wqm6_then() {
1007; GFX9-W64-LABEL: test_strict_wqm6_then:
1008; GFX9-W64:       ; %bb.0: ; %main_body
1009; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1010; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1011; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
1012; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1013; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
1014; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1015; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1016; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
1017; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
1018; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1019; GFX9-W64-NEXT:    s_cbranch_execz .LBB20_2
1020; GFX9-W64-NEXT:  ; %bb.1: ; %if
1021; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1022; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1023; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
1024; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1025; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
1026; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1027; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
1028; GFX9-W64-NEXT:  .LBB20_2: ; %endif
1029; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1030; GFX9-W64-NEXT:    ; return to shader part epilog
1031;
1032; GFX10-W32-LABEL: test_strict_wqm6_then:
1033; GFX10-W32:       ; %bb.0: ; %main_body
1034; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1035; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1036; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
1037; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1038; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1039; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1040; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1041; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
1042; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
1043; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1044; GFX10-W32-NEXT:    s_cbranch_execz .LBB20_2
1045; GFX10-W32-NEXT:  ; %bb.1: ; %if
1046; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1047; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1048; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
1049; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1050; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
1051; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1052; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
1053; GFX10-W32-NEXT:  .LBB20_2: ; %endif
1054; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1055; GFX10-W32-NEXT:    ; return to shader part epilog
1056main_body:
1057  %src0 = load volatile float, float addrspace(1)* undef ; loaded here but consumed in %if, so the strict.wqm input chain spans two blocks; checks expect this load in its own s_wqm region
1058  ; use mbcnt to make sure the branch is divergent
1059  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1060  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1061  %cc = icmp uge i32 %hi, 32
1062  br i1 %cc, label %endif, label %if ; %hi >= 32 bypasses %if and takes the 0.0 phi input
1063
1064if:
1065  %src1 = load volatile float, float addrspace(1)* undef
1066  %out = fadd float %src0, %src1 ; combines the main_body load with the %if load; checks expect a second s_wqm region in %if around the load and add
1067  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1068  br label %endif
1069
1070endif:
1071  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1072  ret float %out.1
1073}
1074
1075; Check that Strict WQM is turned on correctly across basic block boundaries.
1076; loop version
1077;SI-CHECK: buffer_load_dword
1078;VI-CHECK: flat_load_dword
1079;SI-CHECK: buffer_load_dword
1080;VI-CHECK: flat_load_dword
1081define amdgpu_ps float @test_strict_wqm6_loop() {
1082; GFX9-W64-LABEL: test_strict_wqm6_loop:
1083; GFX9-W64:       ; %bb.0: ; %main_body
1084; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1085; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1086; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
1087; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1088; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
1089; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
1090; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
1091; GFX9-W64-NEXT:  .LBB21_1: ; %loop
1092; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1093; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1094; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1095; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
1096; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1097; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1098; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
1099; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1100; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1101; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1102; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
1103; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1104; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1105; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1106; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1107; GFX9-W64-NEXT:    s_cbranch_execnz .LBB21_1
1108; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
1109; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1110; GFX9-W64-NEXT:    ; return to shader part epilog
1111;
1112; GFX10-W32-LABEL: test_strict_wqm6_loop:
1113; GFX10-W32:       ; %bb.0: ; %main_body
1114; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1115; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1116; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
1117; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1118; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1119; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
1120; GFX10-W32-NEXT:    s_mov_b32 s0, 0
1121; GFX10-W32-NEXT:  .LBB21_1: ; %loop
1122; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1123; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1124; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1125; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
1126; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1127; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1128; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
1129; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1130; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1131; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
1132; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1133; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
1134; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1135; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
1136; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
1137; GFX10-W32-NEXT:    s_cbranch_execnz .LBB21_1
1138; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
1139; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1140; GFX10-W32-NEXT:    ; return to shader part epilog
1141main_body:
1142  %src0 = load volatile float, float addrspace(1)* undef ; loaded once before the loop and reused by every iteration's fadd
1143  ; use mbcnt to make sure the branch is divergent
1144  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1145  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) ; NOTE(review): %hi is never used (the counter phi starts from %lo), and the checks contain no v_mbcnt_hi
1146  br label %loop
1147
1148loop:
1149  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ] ; trip count comes from the mbcnt.lo result
1150  %src1 = load volatile float, float addrspace(1)* undef ; checks expect this per-iteration load in its own s_wqm region
1151  %out = fadd float %src0, %src1 ; checks expect this add in a second s_wqm region, separate from the load's
1152  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1153  %counter.1 = sub i32 %counter, 1
1154  %cc = icmp ne i32 %counter.1, 0 ; keep looping until the decremented counter reaches 0
1155  br i1 %cc, label %loop, label %endloop
1156
1157endloop:
1158  ret float %out.0 ; returns the strict.wqm value from the final iteration
1159}
1160
1161; Check that enabling WQM anywhere enables WQM for the set.inactive source.
1162define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
1163; GFX9-W64-LABEL: test_set_inactive2:
1164; GFX9-W64:       ; %bb.0: ; %main_body
1165; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1166; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1167; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
1168; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s0
1169; GFX9-W64-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 idxen
1170; GFX9-W64-NEXT:    s_nop 0
1171; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
1172; GFX9-W64-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
1173; GFX9-W64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
1174; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
1175; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1176; GFX9-W64-NEXT:    v_add_u32_e32 v1, v2, v1
1177; GFX9-W64-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1178; GFX9-W64-NEXT:    s_endpgm
1179;
1180; GFX10-W32-LABEL: test_set_inactive2:
1181; GFX10-W32:       ; %bb.0: ; %main_body
1182; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
1183; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1184; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s1
1185; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
1186; GFX10-W32-NEXT:    s_clause 0x1
1187; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
1188; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1189; GFX10-W32-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
1190; GFX10-W32-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec killed $exec
1191; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
1192; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1193; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1194; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1195; GFX10-W32-NEXT:    s_endpgm
1196main_body:
1197  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0) ; source of set.inactive; checks expect this load after s_wqm at function entry
1198  %src1.0 = bitcast float %src1 to i32
1199  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef) ; inactive-lane value is undef here; the checks show no instructions emitted for it beyond "kill" annotations
1200  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
1201  %src0.0 = bitcast float %src0 to i32
1202  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0) ; the only explicit WQM request in this function
1203  %out = add i32 %src0.1, %src1.1 ; checks expect this add after exec is ANDed back with the mask saved at entry
1204  %out.0 = bitcast i32 %out to float
1205  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
1206  ret void
1207}
1208
1209; Check a case of one branch of an if-else requiring WQM, the other requiring
1210; exact.
1211; Note: In this particular case, the save-and-restore could be avoided if the
1212; analysis understood that the two branches of the if-else are mutually
1213; exclusive.
1214define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1215; GFX9-W64-LABEL: test_control_flow_0:
1216; GFX9-W64:       ; %bb.0: ; %main_body
1217; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1218; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1219; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1220; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1221; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1222; GFX9-W64-NEXT:    s_cbranch_execz .LBB23_2
1223; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
1224; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
1225; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1226; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1227; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
1228; GFX9-W64-NEXT:  .LBB23_2: ; %Flow
1229; GFX9-W64-NEXT:    s_or_saveexec_b64 s[14:15], s[14:15]
1230; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[14:15]
1231; GFX9-W64-NEXT:    s_cbranch_execz .LBB23_4
1232; GFX9-W64-NEXT:  ; %bb.3: ; %IF
1233; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1234; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1235; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1236; GFX9-W64-NEXT:  .LBB23_4: ; %END
1237; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1238; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1239; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1240; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1241; GFX9-W64-NEXT:    ; return to shader part epilog
1242;
1243; GFX10-W32-LABEL: test_control_flow_0:
1244; GFX10-W32:       ; %bb.0: ; %main_body
1245; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1246; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1247; GFX10-W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
1248; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
1249; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1250; GFX10-W32-NEXT:    s_cbranch_execz .LBB23_2
1251; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
1252; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
1253; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1254; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1255; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
1256; GFX10-W32-NEXT:  .LBB23_2: ; %Flow
1257; GFX10-W32-NEXT:    s_or_saveexec_b32 s13, s13
1258; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s13
1259; GFX10-W32-NEXT:    s_cbranch_execz .LBB23_4
1260; GFX10-W32-NEXT:  ; %bb.3: ; %IF
1261; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1262; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1263; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1264; GFX10-W32-NEXT:  .LBB23_4: ; %END
1265; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1266; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1267; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1268; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1269; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1270; GFX10-W32-NEXT:    ; return to shader part epilog
1271main_body:
1272  %cmp = icmp eq i32 %z, 0
1273  br i1 %cmp, label %IF, label %ELSE ; %z == 0 goes to IF (image samples), otherwise ELSE (buffer store)
1274
1275IF:
1276  %c.bc = bitcast i32 %c to float
1277  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1278  %tex0 = extractelement <4 x float> %tex, i32 0
1279  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; coordinate is the first sample's result, so the two samples are dependent
1280  %data.if = extractelement <4 x float> %dtex, i32 0 ; only element 0 of each sample is used; the checks show dmask:0x1
1281  br label %END
1282
1283ELSE:
1284  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) ; checks expect this store under s_and_saveexec with the exec mask saved at entry, restored right after
1285  br label %END
1286
1287END:
1288  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] ; sampled value on the IF path, the unmodified %data argument on the ELSE path
1289  ret float %r
1290}
1291
1292; Reverse branch order compared to the previous test.
1293define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1294; GFX9-W64-LABEL: test_control_flow_1:
1295; GFX9-W64:       ; %bb.0: ; %main_body
1296; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1297; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1298; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1299; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1300; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1301; GFX9-W64-NEXT:    s_cbranch_execz .LBB24_2
1302; GFX9-W64-NEXT:  ; %bb.1: ; %IF
1303; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1304; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1305; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1306; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1307; GFX9-W64-NEXT:  .LBB24_2: ; %Flow
1308; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], s[14:15]
1309; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1310; GFX9-W64-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
1311; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
1312; GFX9-W64-NEXT:    s_cbranch_execz .LBB24_4
1313; GFX9-W64-NEXT:  ; %bb.3: ; %ELSE
1314; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1315; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1316; GFX9-W64-NEXT:  .LBB24_4: ; %END
1317; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1318; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1319; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1320; GFX9-W64-NEXT:    ; return to shader part epilog
1321;
1322; GFX10-W32-LABEL: test_control_flow_1:
1323; GFX10-W32:       ; %bb.0: ; %main_body
1324; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1325; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1326; GFX10-W32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
1327; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
1328; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1329; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_2
1330; GFX10-W32-NEXT:  ; %bb.1: ; %IF
1331; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1332; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1333; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1334; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1335; GFX10-W32-NEXT:  .LBB24_2: ; %Flow
1336; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, s13
1337; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1338; GFX10-W32-NEXT:    s_and_b32 s0, exec_lo, s0
1339; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
1340; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_4
1341; GFX10-W32-NEXT:  ; %bb.3: ; %ELSE
1342; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1343; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1344; GFX10-W32-NEXT:  .LBB24_4: ; %END
1345; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1346; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1347; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1348; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1349; GFX10-W32-NEXT:    ; return to shader part epilog
1350main_body:
1351  %cmp = icmp eq i32 %z, 0
1352  br i1 %cmp, label %ELSE, label %IF ; %z == 0 goes to ELSE (buffer store), otherwise IF (image samples): targets swapped relative to test_control_flow_0
1353
1354IF:
1355  %c.bc = bitcast i32 %c to float
1356  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1357  %tex0 = extractelement <4 x float> %tex, i32 0
1358  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0 ; coordinate is the first sample's result, so the two samples are dependent
1359  %data.if = extractelement <4 x float> %dtex, i32 0 ; only element 0 of each sample is used; the checks show dmask:0x1
1360  br label %END
1361
1362ELSE:
1363  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0) ; checks expect exec to be ANDed with the mask saved at entry in the Flow block, before this store, with no later s_wqm
1364  br label %END
1365
1366END:
1367  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] ; sampled value on the IF path, the unmodified %data argument on the ELSE path
1368  ret float %r
1369}
1370
1371; Check that branch conditions are properly marked as needing WQM...
;
; The branch in main_body is taken on %z, a value loaded from a buffer, and
; both arms compute the coordinate consumed by the image sample in END. The
; load sits between two intrinsic buffer stores. In the expected code each
; store is preceded by an "s_and exec, exec, <saved exec>", while the load and
; the compare feeding the branch execute after an s_wqm.
1372define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
1373; GFX9-W64-LABEL: test_control_flow_2:
1374; GFX9-W64:       ; %bb.0: ; %main_body
1375; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1376; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1377; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1378; GFX9-W64-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
1379; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1380; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
1381; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1382; GFX9-W64-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
1383; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1384; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
1385; GFX9-W64-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
1386; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1387; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1388; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1389; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
1390; GFX9-W64-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
1391; GFX9-W64-NEXT:    ; implicit-def: $vgpr5
1392; GFX9-W64-NEXT:  ; %bb.2: ; %Flow
1393; GFX9-W64-NEXT:    s_or_saveexec_b64 s[14:15], s[14:15]
1394; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[14:15]
1395; GFX9-W64-NEXT:  ; %bb.3: ; %IF
1396; GFX9-W64-NEXT:    v_mul_lo_u32 v0, v5, 3
1397; GFX9-W64-NEXT:  ; %bb.4: ; %END
1398; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1399; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1400; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1401; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1402; GFX9-W64-NEXT:    ; return to shader part epilog
1403;
1404; GFX10-W32-LABEL: test_control_flow_2:
1405; GFX10-W32:       ; %bb.0: ; %main_body
1406; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1407; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1408; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1409; GFX10-W32-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
1410; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1411; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
1412; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1413; GFX10-W32-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0, v0
1414; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1415; GFX10-W32-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
1416; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1417; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1418; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
1419; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1420; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
1421; GFX10-W32-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
1422; GFX10-W32-NEXT:    ; implicit-def: $vgpr5
1423; GFX10-W32-NEXT:  ; %bb.2: ; %Flow
1424; GFX10-W32-NEXT:    s_or_saveexec_b32 s13, s13
1425; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s13
1426; GFX10-W32-NEXT:  ; %bb.3: ; %IF
1427; GFX10-W32-NEXT:    v_mul_lo_u32 v0, v5, 3
1428; GFX10-W32-NEXT:  ; %bb.4: ; %END
1429; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1430; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1431; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1432; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1433; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1434; GFX10-W32-NEXT:    ; return to shader part epilog
1435main_body:
  ; First intrinsic store: uses element 0 of %idx and element 0 of %data.
1436  %idx.1 = extractelement <3 x i32> %idx, i32 0
1437  %data.1 = extractelement <2 x float> %data, i32 0
1438  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
1439
1440  ; The load that determines the branch (and should therefore be WQM) is
1441  ; surrounded by stores that require disabled WQM.
1442  %idx.2 = extractelement <3 x i32> %idx, i32 1
1443  %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0)
1444
  ; Second intrinsic store: uses element 2 of %idx and element 1 of %data.
1445  %idx.3 = extractelement <3 x i32> %idx, i32 2
1446  %data.3 = extractelement <2 x float> %data, i32 1
1447  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0)
1448
  ; Branch on the loaded value; each successor only scales %coord.
1449  %cc = fcmp ogt float %z, 0.0
1450  br i1 %cc, label %IF, label %ELSE
1451
1452IF:
1453  %coord.IF = mul i32 %coord, 3
1454  br label %END
1455
1456ELSE:
1457  %coord.ELSE = mul i32 %coord, 4
1458  br label %END
1459
1460END:
  ; The merged coordinate is the only input to the sample, so the sample
  ; depends on which way the branch above went.
1461  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
1462  %coord.END.bc = bitcast i32 %coord.END to float
1463  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1464  ret <4 x float> %tex
1465}
1466
1467; ... but only if they really do need it.
;
; Counterpart to test_control_flow_2: here the branch condition is derived
; from the result of the second image sample, and neither branch arm feeds a
; later sample (they only scale the returned float). The expected code has a
; single s_wqm at the top, restores the saved exec mask right after the first
; image_sample, and contains no further s_wqm before the compare or branch.
1468define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
1469; GFX9-W64-LABEL: test_control_flow_3:
1470; GFX9-W64:       ; %bb.0: ; %main_body
1471; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1472; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1473; GFX9-W64-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1474; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1475; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1476; GFX9-W64-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1477; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1478; GFX9-W64-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v1
1479; GFX9-W64-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1480; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1481; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1482; GFX9-W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1483; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
1484; GFX9-W64-NEXT:    v_mul_f32_e32 v0, 4.0, v1
1485; GFX9-W64-NEXT:    ; implicit-def: $vgpr1
1486; GFX9-W64-NEXT:  ; %bb.2: ; %Flow
1487; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
1488; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
1489; GFX9-W64-NEXT:  ; %bb.3: ; %IF
1490; GFX9-W64-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
1491; GFX9-W64-NEXT:  ; %bb.4: ; %END
1492; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1493; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1494; GFX9-W64-NEXT:    ; return to shader part epilog
1495;
1496; GFX10-W32-LABEL: test_control_flow_3:
1497; GFX10-W32:       ; %bb.0: ; %main_body
1498; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1499; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1500; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1501; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1502; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1503; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1504; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1505; GFX10-W32-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0, v1
1506; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1507; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1508; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1509; GFX10-W32-NEXT:    s_xor_b32 s0, exec_lo, s0
1510; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
1511; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 4.0, v1
1512; GFX10-W32-NEXT:    ; implicit-def: $vgpr1
1513; GFX10-W32-NEXT:  ; %bb.2: ; %Flow
1514; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, s0
1515; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
1516; GFX10-W32-NEXT:  ; %bb.3: ; %IF
1517; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
1518; GFX10-W32-NEXT:  ; %bb.4: ; %END
1519; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1520; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1521; GFX10-W32-NEXT:    ; return to shader part epilog
1522main_body:
  ; Two dependent samples: channel 0 of the first is the coordinate of the
  ; second.
1523  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1524  %tex0 = extractelement <4 x float> %tex, i32 0
1525  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1526  %dtex.1 = extractelement <4 x float> %dtex, i32 0
  ; The second sample's result is both stored and used as the branch input.
1527  call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
1528
1529  %cc = fcmp ogt float %dtex.1, 0.0
1530  br i1 %cc, label %IF, label %ELSE
1531
1532IF:
1533  %tex.IF = fmul float %dtex.1, 3.0
1534  br label %END
1535
1536ELSE:
1537  %tex.ELSE = fmul float %dtex.1, 4.0
1538  br label %END
1539
1540END:
  ; The merged value is returned directly; no sample consumes it.
1541  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
1542  ret float %tex.END
1543}
1544
1545; Another test that failed at some point because of terminator handling.
;
; The conditionally executed IF block contains only a raw buffer load and an
; intrinsic buffer store; END then performs two dependent image samples. In
; the expected code the IF block is bracketed by an s_and_saveexec against the
; exec mask saved at entry and an s_mov that restores exec afterwards, and the
; first image_sample in END is issued before exec is and-ed with the saved
; mask. The %z parameter is not referenced by the body.
1546define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
1547; GFX9-W64-LABEL: test_control_flow_4:
1548; GFX9-W64:       ; %bb.0: ; %main_body
1549; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1550; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1551; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1552; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1553; GFX9-W64-NEXT:    s_cbranch_execz .LBB27_2
1554; GFX9-W64-NEXT:  ; %bb.1: ; %IF
1555; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
1556; GFX9-W64-NEXT:    buffer_load_dword v1, off, s[0:3], 0
1557; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 1
1558; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1559; GFX9-W64-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 idxen
1560; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
1561; GFX9-W64-NEXT:  .LBB27_2: ; %END
1562; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1563; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1564; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1565; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1566; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1567; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1568; GFX9-W64-NEXT:    ; return to shader part epilog
1569;
1570; GFX10-W32-LABEL: test_control_flow_4:
1571; GFX10-W32:       ; %bb.0: ; %main_body
1572; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1573; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1574; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1575; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
1576; GFX10-W32-NEXT:    s_cbranch_execz .LBB27_2
1577; GFX10-W32-NEXT:  ; %bb.1: ; %IF
1578; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
1579; GFX10-W32-NEXT:    buffer_load_dword v1, off, s[0:3], 0
1580; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 1
1581; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1582; GFX10-W32-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 idxen
1583; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
1584; GFX10-W32-NEXT:  .LBB27_2: ; %END
1585; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1586; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1587; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1588; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1589; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1590; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1591; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1592; GFX10-W32-NEXT:    ; return to shader part epilog
1593main_body:
1594  %cond = icmp eq i32 %y, 0
1595  br i1 %cond, label %IF, label %END
1596
1597IF:
  ; The loaded value is only forwarded to the store; nothing in END uses it.
1598  %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0)
1599  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
1600  br label %END
1601
1602END:
  ; Channel 0 of the first sample is the coordinate of the second sample.
1603  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1604  %tex0 = extractelement <4 x float> %tex, i32 0
1605  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1606  ret <4 x float> %dtex
1607}
1608
1609; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
; The kill sits between two intrinsic buffer stores and is followed by a pair
; of dependent image samples. In the expected code an s_wqm precedes the
; compare that implements the kill condition; the lanes set in vcc are then
; removed from the saved exec mask with s_andn2, with an s_cbranch_scc0 to a
; block that zeroes exec, issues a null export and ends the program. On the
; fall-through path exec is reduced with the same vcc and the saved mask is
; re-applied after the first of the two dependent samples.
; The %ptr parameter is not referenced by the body.
1610define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
1611; GFX9-W64-LABEL: test_kill_0:
1612; GFX9-W64:       ; %bb.0: ; %main_body
1613; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1614; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1615; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1616; GFX9-W64-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
1617; GFX9-W64-NEXT:    s_nop 0
1618; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1619; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1620; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v6
1621; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
1622; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB28_2
1623; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
1624; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
1625; GFX9-W64-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
1626; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1627; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1628; GFX9-W64-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
1629; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1630; GFX9-W64-NEXT:    v_add_f32_e32 v0, v7, v11
1631; GFX9-W64-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
1632; GFX9-W64-NEXT:    v_add_f32_e32 v1, v8, v12
1633; GFX9-W64-NEXT:    v_add_f32_e32 v2, v9, v13
1634; GFX9-W64-NEXT:    v_add_f32_e32 v3, v10, v14
1635; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1636; GFX9-W64-NEXT:    s_branch .LBB28_3
1637; GFX9-W64-NEXT:  .LBB28_2:
1638; GFX9-W64-NEXT:    s_mov_b64 exec, 0
1639; GFX9-W64-NEXT:    exp null off, off, off, off done vm
1640; GFX9-W64-NEXT:    s_endpgm
1641; GFX9-W64-NEXT:  .LBB28_3:
1642;
1643; GFX10-W32-LABEL: test_kill_0:
1644; GFX10-W32:       ; %bb.0: ; %main_body
1645; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1646; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1647; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1648; GFX10-W32-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1649; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1650; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1651; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v6
1652; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
1653; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB28_2
1654; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
1655; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
1656; GFX10-W32-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1657; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1658; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1659; GFX10-W32-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1660; GFX10-W32-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
1661; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1662; GFX10-W32-NEXT:    v_add_f32_e32 v4, v8, v12
1663; GFX10-W32-NEXT:    v_add_f32_e32 v5, v10, v14
1664; GFX10-W32-NEXT:    v_add_f32_e32 v0, v7, v11
1665; GFX10-W32-NEXT:    v_add_f32_e32 v2, v9, v13
1666; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v4
1667; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v5
1668; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1669; GFX10-W32-NEXT:    s_branch .LBB28_3
1670; GFX10-W32-NEXT:  .LBB28_2:
1671; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
1672; GFX10-W32-NEXT:    exp null off, off, off, off done vm
1673; GFX10-W32-NEXT:    s_endpgm
1674; GFX10-W32-NEXT:  .LBB28_3:
1675main_body:
  ; Sample whose result is only consumed by the final fadd, followed by the
  ; first intrinsic store.
1676  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1677  %idx.0 = extractelement <2 x i32> %idx, i32 0
1678  %data.0 = extractelement <2 x float> %data, i32 0
1679  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0)
1680
  ; The kill under test, placed between the two stores.
1681  %z.cmp = fcmp olt float %z, 0.0
1682  call void @llvm.amdgcn.kill(i1 %z.cmp)
1683
  ; Second store, then two dependent samples issued after the kill.
1684  %idx.1 = extractelement <2 x i32> %idx, i32 1
1685  %data.1 = extractelement <2 x float> %data, i32 1
1686  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
1687  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1688  %tex2.0 = extractelement <4 x float> %tex2, i32 0
1689  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1690  %out = fadd <4 x float> %tex, %dtex
1691
1692  ret <4 x float> %out
1693}
1694
1695; ... but only if WQM is necessary.
;
; Counterpart to test_kill_0: both image samples come before the kill and
; nothing after the kill feeds a sample. In the expected code the saved exec
; mask is re-applied after the first image_sample and no further s_wqm is
; emitted, so the kill compare and the s_andn2 updates run without WQM.
;
; NOTE: neither RUN line of this file enables the CHECK prefix (they select
; only the GFX9 wave64 and GFX10 wave32 prefixes), so FileCheck never
; evaluates the CHECK-prefixed lines below. They are kept as a hand-written
; summary; the autogenerated assertions inside the function are what is
; actually verified.
1696; CHECK-LABEL: {{^}}test_kill_1:
1697; CHECK-NEXT: ; %main_body
1698; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
1699; CHECK: s_wqm_b64 exec, exec
1700; CHECK: image_sample
1701; CHECK: s_and_b64 exec, exec, [[ORIG]]
1702; CHECK: image_sample
1703; CHECK-NOT: wqm
1704; CHECK-DAG: buffer_store_dword
1705; CHECK-DAG: v_cmp_
1706define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
1707; GFX9-W64-LABEL: test_kill_1:
1708; GFX9-W64:       ; %bb.0: ; %main_body
1709; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1710; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v2
1711; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1712; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v0
1713; GFX9-W64-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
1714; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1715; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1716; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1717; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v4
1718; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
1719; GFX9-W64-NEXT:    buffer_store_dword v5, off, s[0:3], 0
1720; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB29_2
1721; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
1722; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
1723; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1724; GFX9-W64-NEXT:    s_branch .LBB29_3
1725; GFX9-W64-NEXT:  .LBB29_2:
1726; GFX9-W64-NEXT:    s_mov_b64 exec, 0
1727; GFX9-W64-NEXT:    exp null off, off, off, off done vm
1728; GFX9-W64-NEXT:    s_endpgm
1729; GFX9-W64-NEXT:  .LBB29_3:
1730;
1731; GFX10-W32-LABEL: test_kill_1:
1732; GFX10-W32:       ; %bb.0: ; %main_body
1733; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1734; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v2
1735; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1736; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v0
1737; GFX10-W32-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1738; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1739; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1740; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1741; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v4
1742; GFX10-W32-NEXT:    buffer_store_dword v5, off, s[0:3], 0
1743; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
1744; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB29_2
1745; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
1746; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
1747; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1748; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1749; GFX10-W32-NEXT:    s_branch .LBB29_3
1750; GFX10-W32-NEXT:  .LBB29_2:
1751; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
1752; GFX10-W32-NEXT:    exp null off, off, off, off done vm
1753; GFX10-W32-NEXT:    s_endpgm
1754; GFX10-W32-NEXT:  .LBB29_3:
1755main_body:
  ; Two dependent samples; the second one's result is the return value.
1756  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1757  %tex0 = extractelement <4 x float> %tex, i32 0
1758  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1759
1760  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)
1761
  ; The kill is the last operation before the return; no sample follows it.
1762  %z.cmp = fcmp olt float %z, 0.0
1763  call void @llvm.amdgcn.kill(i1 %z.cmp)
1764
1765  ret <4 x float> %dtex
1766}
1767
1768; Check prolog shaders.
;
; The body is a single fadd with no image or buffer intrinsics, yet the
; expected code saves exec, enters WQM with s_wqm, performs the add and then
; re-applies the saved mask before returning.
; NOTE(review): this presumably comes from attribute group #5 on the function;
; its definition is outside this part of the file -- confirm there.
;
; NOTE: neither RUN line of this file enables the CHECK prefix, so FileCheck
; never evaluates the CHECK-prefixed lines below; the autogenerated
; assertions inside the function are what is actually verified.
1769; CHECK-LABEL: {{^}}test_prolog_1:
1770; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
1771; CHECK: s_wqm_b64 exec, exec
1772; CHECK: v_add_f32_e32 v0,
1773; CHECK: s_and_b64 exec, exec, [[ORIG]]
1774define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
1775; GFX9-W64-LABEL: test_prolog_1:
1776; GFX9-W64:       ; %bb.0: ; %main_body
1777; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1778; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1779; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
1780; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1781; GFX9-W64-NEXT:    ; return to shader part epilog
1782;
1783; GFX10-W32-LABEL: test_prolog_1:
1784; GFX10-W32:       ; %bb.0: ; %main_body
1785; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1786; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1787; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
1788; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1789; GFX10-W32-NEXT:    ; return to shader part epilog
1790main_body:
1791  %s = fadd float %a, %b
1792  ret float %s
1793}
1794
1795; CHECK-LABEL: {{^}}test_loop_vcc:
1796; CHECK-NEXT: ; %entry
1797; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
1798; CHECK: s_wqm_b64 exec, exec
1799; CHECK: v_mov
1800; CHECK: v_mov
1801; CHECK: v_mov
1802; CHECK: v_mov
1803; CHECK: s_and_b64 exec, exec, [[LIVE]]
1804; CHECK: image_store
1805; CHECK: s_wqm_b64 exec, exec
1806; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
1807; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
1808
1809; CHECK: [[LOOPHDR:.LBB[0-9]+_[0-9]+]]: ; %body
1810; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
1811; CHECK: [[LOOP:.LBB[0-9]+_[0-9]+]]: ; %loop
1812; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
1813; CHECK: s_cbranch_vccz [[LOOPHDR]]
1814
1815; CHECK: ; %break
1816; CHECK: ; return
;
; NOTE: neither RUN line of this file enables the CHECK prefix, so FileCheck
; never evaluates the CHECK-prefixed lines above; the autogenerated
; assertions inside the function are what is actually verified.
;
; A loop whose exit test is a float compare lowered to a vcc branch
; (s_cbranch_vccz in the expected code). The entry block does an image store
; of %in; the loop body then samples using element 0 of the value carried
; around the loop, and a float counter starting at 0.0 is advanced by 2.0 per
; iteration until it exceeds 7.0. In the expected code the saved exec mask is
; applied before the image_store, an s_wqm follows it and covers the loop, and
; the saved mask is applied again in the break block.
1817define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
1818; GFX9-W64-LABEL: test_loop_vcc:
1819; GFX9-W64:       ; %bb.0: ; %entry
1820; GFX9-W64-NEXT:    s_mov_b64 s[8:9], exec
1821; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1822; GFX9-W64-NEXT:    v_mov_b32_e32 v7, v3
1823; GFX9-W64-NEXT:    v_mov_b32_e32 v6, v2
1824; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v1
1825; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
1826; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
1827; GFX9-W64-NEXT:    s_mov_b32 s0, 0
1828; GFX9-W64-NEXT:    s_mov_b32 s1, s0
1829; GFX9-W64-NEXT:    s_mov_b32 s2, s0
1830; GFX9-W64-NEXT:    s_mov_b32 s3, s0
1831; GFX9-W64-NEXT:    s_mov_b32 s4, s0
1832; GFX9-W64-NEXT:    s_mov_b32 s5, s0
1833; GFX9-W64-NEXT:    s_mov_b32 s6, s0
1834; GFX9-W64-NEXT:    s_mov_b32 s7, s0
1835; GFX9-W64-NEXT:    image_store v[4:7], v0, s[0:7] dmask:0xf unorm
1836; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1837; GFX9-W64-NEXT:    v_mov_b32_e32 v8, 0
1838; GFX9-W64-NEXT:    s_mov_b32 s10, 0x40e00000
1839; GFX9-W64-NEXT:    s_branch .LBB31_2
1840; GFX9-W64-NEXT:  .LBB31_1: ; %body
1841; GFX9-W64-NEXT:    ; in Loop: Header=BB31_2 Depth=1
1842; GFX9-W64-NEXT:    s_mov_b32 s1, s0
1843; GFX9-W64-NEXT:    s_mov_b32 s2, s0
1844; GFX9-W64-NEXT:    s_mov_b32 s3, s0
1845; GFX9-W64-NEXT:    s_mov_b32 s4, s0
1846; GFX9-W64-NEXT:    s_mov_b32 s5, s0
1847; GFX9-W64-NEXT:    s_mov_b32 s6, s0
1848; GFX9-W64-NEXT:    s_mov_b32 s7, s0
1849; GFX9-W64-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
1850; GFX9-W64-NEXT:    v_add_f32_e32 v8, 2.0, v8
1851; GFX9-W64-NEXT:    s_mov_b64 s[2:3], 0
1852; GFX9-W64-NEXT:    s_cbranch_execz .LBB31_4
1853; GFX9-W64-NEXT:  .LBB31_2: ; %loop
1854; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1855; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1856; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v4
1857; GFX9-W64-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v8
1858; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v5
1859; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v6
1860; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v7
1861; GFX9-W64-NEXT:    s_and_b64 vcc, exec, vcc
1862; GFX9-W64-NEXT:    s_cbranch_vccz .LBB31_1
1863; GFX9-W64-NEXT:  ; %bb.3:
1864; GFX9-W64-NEXT:    s_mov_b64 s[2:3], -1
1865; GFX9-W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1866; GFX9-W64-NEXT:    ; implicit-def: $vgpr8
1867; GFX9-W64-NEXT:  .LBB31_4: ; %break
1868; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
1869; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1870; GFX9-W64-NEXT:    ; return to shader part epilog
1871;
1872; GFX10-W32-LABEL: test_loop_vcc:
1873; GFX10-W32:       ; %bb.0: ; %entry
1874; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
1875; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1876; GFX10-W32-NEXT:    v_mov_b32_e32 v8, 0
1877; GFX10-W32-NEXT:    s_mov_b32 s0, 0
1878; GFX10-W32-NEXT:    s_mov_b32 s1, s0
1879; GFX10-W32-NEXT:    s_mov_b32 s2, s0
1880; GFX10-W32-NEXT:    s_mov_b32 s3, s0
1881; GFX10-W32-NEXT:    s_mov_b32 s4, s0
1882; GFX10-W32-NEXT:    s_mov_b32 s5, s0
1883; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
1884; GFX10-W32-NEXT:    s_mov_b32 s6, s0
1885; GFX10-W32-NEXT:    s_mov_b32 s7, s0
1886; GFX10-W32-NEXT:    image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
1887; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1888; GFX10-W32-NEXT:    s_branch .LBB31_2
1889; GFX10-W32-NEXT:    .p2align 6
1890; GFX10-W32-NEXT:  .LBB31_1: ; %body
1891; GFX10-W32-NEXT:    ; in Loop: Header=BB31_2 Depth=1
1892; GFX10-W32-NEXT:    s_mov_b32 s1, s0
1893; GFX10-W32-NEXT:    s_mov_b32 s2, s0
1894; GFX10-W32-NEXT:    s_mov_b32 s3, s0
1895; GFX10-W32-NEXT:    s_mov_b32 s4, s0
1896; GFX10-W32-NEXT:    s_mov_b32 s5, s0
1897; GFX10-W32-NEXT:    s_mov_b32 s6, s0
1898; GFX10-W32-NEXT:    s_mov_b32 s7, s0
1899; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
1900; GFX10-W32-NEXT:    image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1901; GFX10-W32-NEXT:    s_mov_b32 s1, 0
1902; GFX10-W32-NEXT:    s_cbranch_execz .LBB31_4
1903; GFX10-W32-NEXT:  .LBB31_2: ; %loop
1904; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1905; GFX10-W32-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
1906; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1907; GFX10-W32-NEXT:    v_mov_b32_e32 v7, v3
1908; GFX10-W32-NEXT:    v_mov_b32_e32 v6, v2
1909; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v1
1910; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
1911; GFX10-W32-NEXT:    s_and_b32 vcc_lo, exec_lo, vcc_lo
1912; GFX10-W32-NEXT:    s_cbranch_vccz .LBB31_1
1913; GFX10-W32-NEXT:  ; %bb.3:
1914; GFX10-W32-NEXT:    s_mov_b32 s1, -1
1915; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1916; GFX10-W32-NEXT:    ; implicit-def: $vgpr8
1917; GFX10-W32-NEXT:  .LBB31_4: ; %break
1918; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
1919; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1920; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v4
1921; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v5
1922; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v6
1923; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v7
1924; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1925; GFX10-W32-NEXT:    ; return to shader part epilog
1926entry:
  ; Image store of the incoming vector; coordinate and resource are undef.
1927  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
1928  br label %loop
1929
1930loop:
  ; %ctr.iv is the float trip counter; %c.iv carries the last sample result
  ; (or %in on the first iteration).
1931  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
1932  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
1933  %cc = fcmp ogt float %ctr.iv, 7.0
1934  br i1 %cc, label %break, label %body
1935
1936body:
  ; Element 0 of the carried value is the coordinate of the next sample.
1937  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
1938  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
1939  %ctr.next = fadd float %ctr.iv, 2.0
1940  br label %loop
1941
1942break:
1943  ret <4 x float> %c.iv
1944}
1945
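1946; Only intrinsic stores need exact execution -- other stores do not have
1947; externally visible effects and may require WQM for correctness.
;
; NOTE: neither RUN line of this file enables the CHECK prefix, so FileCheck
; never evaluates the CHECK-prefixed lines below; the autogenerated
; assertions inside the function are what is actually verified.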
1948; CHECK-LABEL: {{^}}test_alloca:
1949; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
1950; CHECK: s_wqm_b64 exec, exec
1951
1952; CHECK: s_and_b64 exec, exec, [[LIVE]]
1953; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
1954; CHECK: s_wqm_b64 exec, exec
1955; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
1956; CHECK: s_and_b64 exec, exec, [[LIVE]]
1957; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
1958; CHECK: s_wqm_b64 exec, exec
1959; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
1960
1961; CHECK: s_and_b64 exec, exec, [[LIVE]]
1962; CHECK: image_sample
1963; CHECK: buffer_store_dwordx4
1964define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
1965; GFX9-W64-LABEL: test_alloca:
1966; GFX9-W64:       ; %bb.0: ; %entry
1967; GFX9-W64-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1968; GFX9-W64-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1969; GFX9-W64-NEXT:    s_mov_b32 s10, -1
1970; GFX9-W64-NEXT:    s_mov_b32 s11, 0xe00000
1971; GFX9-W64-NEXT:    s_add_u32 s8, s8, s0
1972; GFX9-W64-NEXT:    s_addc_u32 s9, s9, 0
1973; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1974; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1975; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1976; GFX9-W64-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1977; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1978; GFX9-W64-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4
1979; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1980; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1981; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
1982; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 idxen
1983; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1984; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 4
1985; GFX9-W64-NEXT:    v_lshl_add_u32 v0, v2, 2, v0
1986; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen
1987; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1988; GFX9-W64-NEXT:    s_mov_b32 s0, 0
1989; GFX9-W64-NEXT:    s_mov_b32 s1, s0
1990; GFX9-W64-NEXT:    s_mov_b32 s2, s0
1991; GFX9-W64-NEXT:    s_mov_b32 s3, s0
1992; GFX9-W64-NEXT:    s_mov_b32 s4, s0
1993; GFX9-W64-NEXT:    s_mov_b32 s5, s0
1994; GFX9-W64-NEXT:    s_mov_b32 s6, s0
1995; GFX9-W64-NEXT:    s_mov_b32 s7, s0
1996; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1997; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
1998; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1999; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2000; GFX9-W64-NEXT:    s_endpgm
2001;
2002; GFX10-W32-LABEL: test_alloca:
2003; GFX10-W32:       ; %bb.0: ; %entry
2004; GFX10-W32-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
2005; GFX10-W32-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
2006; GFX10-W32-NEXT:    s_mov_b32 s10, -1
2007; GFX10-W32-NEXT:    s_mov_b32 s11, 0x31c16000
2008; GFX10-W32-NEXT:    s_add_u32 s8, s8, s0
2009; GFX10-W32-NEXT:    s_addc_u32 s9, s9, 0
2010; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
2011; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2012; GFX10-W32-NEXT:    v_mov_b32_e32 v3, 1
2013; GFX10-W32-NEXT:    v_lshl_add_u32 v2, v2, 2, 4
2014; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2015; GFX10-W32-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2016; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2017; GFX10-W32-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4
2018; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2019; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2020; GFX10-W32-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 idxen
2021; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2022; GFX10-W32-NEXT:    buffer_load_dword v0, v2, s[8:11], 0 offen
2023; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2024; GFX10-W32-NEXT:    s_mov_b32 s0, 0
2025; GFX10-W32-NEXT:    s_mov_b32 s1, s0
2026; GFX10-W32-NEXT:    s_mov_b32 s2, s0
2027; GFX10-W32-NEXT:    s_mov_b32 s3, s0
2028; GFX10-W32-NEXT:    s_mov_b32 s4, s0
2029; GFX10-W32-NEXT:    s_mov_b32 s5, s0
2030; GFX10-W32-NEXT:    s_mov_b32 s6, s0
2031; GFX10-W32-NEXT:    s_mov_b32 s7, s0
2032; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2033; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2034; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2035; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2036; GFX10-W32-NEXT:    s_endpgm
2037entry:
2038  %array = alloca [32 x i32], align 4, addrspace(5)
2039
2040  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)
2041
2042  %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
2043  store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4
2044
2045  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
2046
2047  %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
2048  %c = load i32, i32 addrspace(5)* %c.gep, align 4
2049  %c.bc = bitcast i32 %c to float
2050  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2051  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0)
2052
2053  ret void
2054}
2055
2056; Must return to exact at the end of a non-void returning shader,
2057; otherwise the EXEC mask exported by the epilog will be wrong. This is true
2058; even if the shader has no kills, because a kill could have happened in a
2059; previous shader fragment.
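2060; The hand-written checks that stood here used the CHECK prefix, which the RUN
2061; lines at the top of this file do not enable, so FileCheck never evaluated
2062; them. They described: save exec, enter WQM with s_wqm_b64, AND exec with the
2063; saved mask, and no further mention of exec after that. The autogenerated
2064; assertions inside the function body cover this sequence.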
2065define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
2066; GFX9-W64-LABEL: test_nonvoid_return:
2067; GFX9-W64:       ; %bb.0:
2068; GFX9-W64-NEXT:    s_mov_b32 s0, 0
2069; GFX9-W64-NEXT:    s_mov_b64 s[8:9], exec
2070; GFX9-W64-NEXT:    s_mov_b32 s1, s0
2071; GFX9-W64-NEXT:    s_mov_b32 s2, s0
2072; GFX9-W64-NEXT:    s_mov_b32 s3, s0
2073; GFX9-W64-NEXT:    s_mov_b32 s4, s0
2074; GFX9-W64-NEXT:    s_mov_b32 s5, s0
2075; GFX9-W64-NEXT:    s_mov_b32 s6, s0
2076; GFX9-W64-NEXT:    s_mov_b32 s7, s0
2077; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2078; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2079; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[8:9]
2080; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2081; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2082; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2083; GFX9-W64-NEXT:    ; return to shader part epilog
2084;
2085; GFX10-W32-LABEL: test_nonvoid_return:
2086; GFX10-W32:       ; %bb.0:
2087; GFX10-W32-NEXT:    s_mov_b32 s0, 0
2088; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
2089; GFX10-W32-NEXT:    s_mov_b32 s1, s0
2090; GFX10-W32-NEXT:    s_mov_b32 s2, s0
2091; GFX10-W32-NEXT:    s_mov_b32 s3, s0
2092; GFX10-W32-NEXT:    s_mov_b32 s4, s0
2093; GFX10-W32-NEXT:    s_mov_b32 s5, s0
2094; GFX10-W32-NEXT:    s_mov_b32 s6, s0
2095; GFX10-W32-NEXT:    s_mov_b32 s7, s0
2096; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2097; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2098; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
2099; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2100; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2101; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2102; GFX10-W32-NEXT:    ; return to shader part epilog
  ; First sample: only element 0 of its result is used, as the coordinate of
  ; the second sample (the expected code above loads it with dmask:0x1).
2103  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2104  %tex0 = extractelement <4 x float> %tex, i32 0
  ; Second sample: produces the returned value. In the expected code above,
  ; exec is ANDed with the saved mask between the two image_sample
  ; instructions and is not written again before the return.
2105  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2106  ret <4 x float> %dtex
2107}
2108
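2109; Same requirement as test_nonvoid_return, with an extra branch to a block that
2110; ends in unreachable. The hand-written checks that stood here used the CHECK
2111; prefix, which the RUN lines at the top of this file do not enable, so they
2112; were never evaluated. They described: save exec, s_wqm_b64, AND exec with
2113; the saved mask, and no further mention of exec after that.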
2114define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
2115; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
2116; GFX9-W64:       ; %bb.0: ; %entry
2117; GFX9-W64-NEXT:    s_mov_b32 s4, 0
2118; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2119; GFX9-W64-NEXT:    s_mov_b32 s5, s4
2120; GFX9-W64-NEXT:    s_mov_b32 s6, s4
2121; GFX9-W64-NEXT:    s_mov_b32 s7, s4
2122; GFX9-W64-NEXT:    s_mov_b32 s8, s4
2123; GFX9-W64-NEXT:    s_mov_b32 s9, s4
2124; GFX9-W64-NEXT:    s_mov_b32 s10, s4
2125; GFX9-W64-NEXT:    s_mov_b32 s11, s4
2126; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2127; GFX9-W64-NEXT:    image_sample v0, v0, s[4:11], s[0:3] dmask:0x1
2128; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
2129; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2130; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf
2131; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
2132; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB34_2
2133; GFX9-W64-NEXT:  ; %bb.1: ; %else
2134; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2135; GFX9-W64-NEXT:    s_branch .LBB34_3
2136; GFX9-W64-NEXT:  .LBB34_2: ; %if
2137; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2138; GFX9-W64-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
2139; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2140; GFX9-W64-NEXT:  .LBB34_3:
2141;
2142; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
2143; GFX10-W32:       ; %bb.0: ; %entry
2144; GFX10-W32-NEXT:    s_mov_b32 s4, 0
2145; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
2146; GFX10-W32-NEXT:    s_mov_b32 s5, s4
2147; GFX10-W32-NEXT:    s_mov_b32 s6, s4
2148; GFX10-W32-NEXT:    s_mov_b32 s7, s4
2149; GFX10-W32-NEXT:    s_mov_b32 s8, s4
2150; GFX10-W32-NEXT:    s_mov_b32 s9, s4
2151; GFX10-W32-NEXT:    s_mov_b32 s10, s4
2152; GFX10-W32-NEXT:    s_mov_b32 s11, s4
2153; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2154; GFX10-W32-NEXT:    image_sample v0, v0, s[4:11], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2155; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
2156; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2157; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2158; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
2159; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB34_2
2160; GFX10-W32-NEXT:  ; %bb.1: ; %else
2161; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2162; GFX10-W32-NEXT:    s_branch .LBB34_3
2163; GFX10-W32-NEXT:  .LBB34_2: ; %if
2164; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2165; GFX10-W32-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
2166; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2167; GFX10-W32-NEXT:  .LBB34_3:
2168entry:
  ; Two dependent samples: element 0 of the first result is the coordinate of
  ; the second. In the expected code above, exec is ANDed with the saved mask
  ; between the two image_sample instructions, before the branch.
2169  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2170  %tex0 = extractelement <4 x float> %tex, i32 0
2171  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  ; %c is an inreg argument; the expected code above tests it with
  ; s_cmp_lt_i32 on s0.
2172  %cc = icmp sgt i32 %c, 0
2173  br i1 %cc, label %if, label %else
2174
2175if:
  ; This path stores the sampled value and never returns.
2176  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
2177  unreachable
2178
2179else:
  ; The only returning path.
2180  ret <4 x float> %dtex
2181}
2182
2183; Test awareness that s_wqm_b64 clobbers SCC.
2184; The hand-written checks that stood here used the CHECK prefix, which the RUN
2185; lines at the top of this file do not enable, so they were never evaluated.
2186; They described the expected order: save exec, s_wqm_b64, then an s_cmp
2187; immediately followed by s_cbranch_scc, an image_sample in each of %else and
2188; %if, and finally, in %end, the AND of exec with the saved mask.
2195define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
2196; GFX9-W64-LABEL: test_scc:
2197; GFX9-W64:       ; %bb.0: ; %main_body
2198; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2199; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
2200; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2201; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
2202; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB35_2
2203; GFX9-W64-NEXT:  ; %bb.1: ; %else
2204; GFX9-W64-NEXT:    s_mov_b32 s4, 0
2205; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2206; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
2207; GFX9-W64-NEXT:    s_mov_b32 s5, s4
2208; GFX9-W64-NEXT:    s_mov_b32 s6, s4
2209; GFX9-W64-NEXT:    s_mov_b32 s7, s4
2210; GFX9-W64-NEXT:    s_mov_b32 s8, s4
2211; GFX9-W64-NEXT:    s_mov_b32 s9, s4
2212; GFX9-W64-NEXT:    s_mov_b32 s10, s4
2213; GFX9-W64-NEXT:    s_mov_b32 s11, s4
2214; GFX9-W64-NEXT:    image_sample v[0:3], v[0:1], s[4:11], s[0:3] dmask:0xf
2215; GFX9-W64-NEXT:    s_cbranch_execz .LBB35_3
2216; GFX9-W64-NEXT:    s_branch .LBB35_4
2217; GFX9-W64-NEXT:  .LBB35_2:
2218; GFX9-W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2219; GFX9-W64-NEXT:  .LBB35_3: ; %if
2220; GFX9-W64-NEXT:    s_mov_b32 s4, 0
2221; GFX9-W64-NEXT:    s_mov_b32 s5, s4
2222; GFX9-W64-NEXT:    s_mov_b32 s6, s4
2223; GFX9-W64-NEXT:    s_mov_b32 s7, s4
2224; GFX9-W64-NEXT:    s_mov_b32 s8, s4
2225; GFX9-W64-NEXT:    s_mov_b32 s9, s4
2226; GFX9-W64-NEXT:    s_mov_b32 s10, s4
2227; GFX9-W64-NEXT:    s_mov_b32 s11, s4
2228; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2229; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2230; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[4:11], s[0:3] dmask:0xf
2231; GFX9-W64-NEXT:  .LBB35_4: ; %end
2232; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
2233; GFX9-W64-NEXT:    v_mov_b32_e32 v5, 1.0
2234; GFX9-W64-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
2235; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2236; GFX9-W64-NEXT:    ; return to shader part epilog
2237;
2238; GFX10-W32-LABEL: test_scc:
2239; GFX10-W32:       ; %bb.0: ; %main_body
2240; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
2241; GFX10-W32-NEXT:    s_mov_b32 s8, exec_lo
2242; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2243; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
2244; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB35_2
2245; GFX10-W32-NEXT:  ; %bb.1: ; %else
2246; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2247; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 1
2248; GFX10-W32-NEXT:    s_mov_b32 s0, 0
2249; GFX10-W32-NEXT:    s_mov_b32 s1, s0
2250; GFX10-W32-NEXT:    s_mov_b32 s2, s0
2251; GFX10-W32-NEXT:    s_mov_b32 s3, s0
2252; GFX10-W32-NEXT:    s_mov_b32 s4, s0
2253; GFX10-W32-NEXT:    s_mov_b32 s5, s0
2254; GFX10-W32-NEXT:    s_mov_b32 s6, s0
2255; GFX10-W32-NEXT:    s_mov_b32 s7, s0
2256; GFX10-W32-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
2257; GFX10-W32-NEXT:    s_cbranch_execz .LBB35_3
2258; GFX10-W32-NEXT:    s_branch .LBB35_4
2259; GFX10-W32-NEXT:  .LBB35_2:
2260; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2261; GFX10-W32-NEXT:  .LBB35_3: ; %if
2262; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2263; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2264; GFX10-W32-NEXT:    s_mov_b32 s0, 0
2265; GFX10-W32-NEXT:    s_mov_b32 s1, s0
2266; GFX10-W32-NEXT:    s_mov_b32 s2, s0
2267; GFX10-W32-NEXT:    s_mov_b32 s3, s0
2268; GFX10-W32-NEXT:    s_mov_b32 s4, s0
2269; GFX10-W32-NEXT:    s_mov_b32 s5, s0
2270; GFX10-W32-NEXT:    s_mov_b32 s6, s0
2271; GFX10-W32-NEXT:    s_mov_b32 s7, s0
2272; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2273; GFX10-W32-NEXT:  .LBB35_4: ; %end
2274; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s8
2275; GFX10-W32-NEXT:    v_mov_b32_e32 v5, 1.0
2276; GFX10-W32-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
2277; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2278; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2279; GFX10-W32-NEXT:    ; return to shader part epilog
2280main_body:
  ; Branch on the inreg argument %sel. In the expected code above the s_cmp
  ; for this comparison is emitted after the s_wqm instruction and directly
  ; before the s_cbranch_scc0 that consumes it.
2281  %cc = icmp sgt i32 %sel, 0
2282  br i1 %cc, label %if, label %else
2283
2284if:
  ; 1D sample at coordinate 0.0.
2285  %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2286  br label %end
2287
2288else:
  ; 2D sample; the second coordinate is the bit pattern of integer 1
  ; reinterpreted as a float.
2289  %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2290  br label %end
2291
2292end:
2293  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  ; Buffer store of 1.0 at index %idx. In the expected code above, exec is
  ; ANDed with the saved mask at the start of %end, before this store.
2294  call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
2295  ret <4 x float> %r
2296}
2297
2298; Check a case of a block being entirely WQM except for a bit of WWM.
2299; There was a bug where it forgot to enter and leave WWM.
2300define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2301; GFX9-W64-LABEL: test_wwm_within_wqm:
2302; GFX9-W64:       ; %bb.0: ; %main_body
2303; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2304; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2305; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2306; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
2307; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2308; GFX9-W64-NEXT:    s_cbranch_execz .LBB36_2
2309; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2310; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2311; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2312; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2313; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2314; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v0
2315; GFX9-W64-NEXT:    s_not_b64 exec, exec
2316; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 0
2317; GFX9-W64-NEXT:    s_not_b64 exec, exec
2318; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2319; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2320; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2321; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2322; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2323; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v0
2324; GFX9-W64-NEXT:  .LBB36_2: ; %ENDIF
2325; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2326; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2327; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2328; GFX9-W64-NEXT:    ; return to shader part epilog
2329;
2330; GFX10-W32-LABEL: test_wwm_within_wqm:
2331; GFX10-W32:       ; %bb.0: ; %main_body
2332; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2333; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2334; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2335; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
2336; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
2337; GFX10-W32-NEXT:    s_cbranch_execz .LBB36_2
2338; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2339; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2340; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2341; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2342; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2343; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v0
2344; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2345; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 0
2346; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2347; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2348; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2349; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2350; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2351; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2352; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v0
2353; GFX10-W32-NEXT:  .LBB36_2: ; %ENDIF
2354; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2355; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2356; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2357; GFX10-W32-NEXT:    ; return to shader part epilog
2358main_body:
  ; NOTE: the %data argument is not referenced anywhere in this body.
  ; Two dependent samples; in the expected code above both image_sample
  ; instructions appear in the %IF block, where their result is consumed.
2359  %c.bc = bitcast i32 %c to float
2360  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2361  %tex0 = extractelement <4 x float> %tex, i32 0
2362  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2363  %cmp = icmp eq i32 %z, 0
2364  br i1 %cmp, label %IF, label %ENDIF
2365
2366IF:
2367  %dataf = extractelement <4 x float> %dtex, i32 0
2368  %data1 = fptosi float %dataf to i32
  ; set.inactive -> ds.swizzle -> wwm chain. In the expected code above this
  ; becomes the s_not/v_mov/s_not sequence followed by the ds_swizzle, which
  ; is bracketed by s_or_saveexec ... -1 and a restore of exec. The swizzle
  ; pattern 2079 is printed there as swizzle(SWAP,2).
2369  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
2370  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2371  %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
2372  %data4f = sitofp i32 %data4 to float
2373  br label %ENDIF
2374
2375ENDIF:
  ; 0.0 is returned when the %IF block was skipped.
2376  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2377  ret float %r
2378}
2379
2380; Check that WWM is triggered by the strict_wwm intrinsic.
2381define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
2382; GFX9-W64-LABEL: test_strict_wwm1:
2383; GFX9-W64:       ; %bb.0: ; %main_body
2384; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2385; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2386; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
2387; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2388; GFX9-W64-NEXT:    s_nop 0
2389; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2390; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2391; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
2392; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2393; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2394; GFX9-W64-NEXT:    ; return to shader part epilog
2395;
2396; GFX10-W32-LABEL: test_strict_wwm1:
2397; GFX10-W32:       ; %bb.0: ; %main_body
2398; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2399; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2400; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
2401; GFX10-W32-NEXT:    s_clause 0x1
2402; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2403; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2404; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2405; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
2406; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2407; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2408; GFX10-W32-NEXT:    ; return to shader part epilog
2409main_body:
  ; Two buffer loads and their sum feed the strict.wwm call. In the expected
  ; code above the loads and the v_add_f32 sit between s_or_saveexec ... -1
  ; and the restore of exec; only the final copy to v0 comes after it.
2410  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
2411  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
2412  %out = fadd float %src0, %src1
2413  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2414  ret float %out.0
2415}
2416
2417; Same as above, but with an integer type.
2418define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
2419; GFX9-W64-LABEL: test_strict_wwm2:
2420; GFX9-W64:       ; %bb.0: ; %main_body
2421; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2422; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2423; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
2424; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2425; GFX9-W64-NEXT:    s_nop 0
2426; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2427; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2428; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
2429; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2430; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2431; GFX9-W64-NEXT:    ; return to shader part epilog
2432;
2433; GFX10-W32-LABEL: test_strict_wwm2:
2434; GFX10-W32:       ; %bb.0: ; %main_body
2435; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2436; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2437; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
2438; GFX10-W32-NEXT:    s_clause 0x1
2439; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2440; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2441; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2442; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2443; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2444; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2445; GFX10-W32-NEXT:    ; return to shader part epilog
2446main_body:
2447  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
2448  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ; The loaded floats are reinterpreted as i32 so that the add and the
  ; strict.wwm call operate on an integer type; the expected code above uses
  ; an integer v_add inside the s_or_saveexec ... -1 region.
2449  %src0.0 = bitcast float %src0 to i32
2450  %src1.0 = bitcast float %src1 to i32
2451  %out = add i32 %src0.0, %src1.0
2452  %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
  ; Reinterpret back to float to match the function's return type.
2453  %out.1 = bitcast i32 %out.0 to float
2454  ret float %out.1
2455}
2456
2457; Check that we don't leave WWM on for computations that don't require WWM,
2458; since that will lead to clobbering things that aren't supposed to be clobbered
2459; in cases like this.
2460; We enforce this by checking that v_add gets emitted in the same block as
2461; WWM computations.
2462define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
2463; GFX9-W64-LABEL: test_strict_wwm3:
2464; GFX9-W64:       ; %bb.0: ; %main_body
2465; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2466; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2467; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
2468; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2469; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2470; GFX9-W64-NEXT:    s_cbranch_execz .LBB39_2
2471; GFX9-W64-NEXT:  ; %bb.1: ; %if
2472; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2473; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2474; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2475; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2476; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
2477; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2478; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2479; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
2480; GFX9-W64-NEXT:  .LBB39_2: ; %endif
2481; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
2482; GFX9-W64-NEXT:    ; return to shader part epilog
2483;
2484; GFX10-W32-LABEL: test_strict_wwm3:
2485; GFX10-W32:       ; %bb.0: ; %main_body
2486; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2487; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2488; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
2489; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2490; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2491; GFX10-W32-NEXT:    s_cbranch_execz .LBB39_2
2492; GFX10-W32-NEXT:  ; %bb.1: ; %if
2493; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2494; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2495; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2496; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2497; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
2498; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2499; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2500; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
2501; GFX10-W32-NEXT:  .LBB39_2: ; %endif
2502; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2503; GFX10-W32-NEXT:    ; return to shader part epilog
2504main_body:
2505  ; use mbcnt to make sure the branch is divergent
2506  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2507  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2508  %cc = icmp uge i32 %hi, 32
2509  br i1 %cc, label %endif, label %if
2510
2511if:
2512  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ; %out feeds strict.wwm; in the expected code above its v_add_f32 is inside
  ; the s_or_saveexec ... -1 region.
2513  %out = fadd float %src, %src
2514  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
  ; %out.1 consumes the strict.wwm result; in the expected code above its
  ; v_add_f32 is emitted after exec has been restored, still inside %if.
2515  %out.1 = fadd float %src, %out.0
2516  br label %endif
2517
2518endif:
  ; 0.0 is returned when the %if block was skipped.
2519  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
2520  ret float %out.2
2521}
2522
2523; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
2524; write could clobber disabled channels in the non-WWM one.
2525; We enforce this by checking that v_mov gets emitted in the same block as
2526; WWM computations.
2527define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
2528; GFX9-W64-LABEL: test_strict_wwm4:
2529; GFX9-W64:       ; %bb.0: ; %main_body
2530; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2531; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2532; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
2533; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2534; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2535; GFX9-W64-NEXT:    s_cbranch_execz .LBB40_2
2536; GFX9-W64-NEXT:  ; %bb.1: ; %if
2537; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2538; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2539; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2540; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2541; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
2542; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2543; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2544; GFX9-W64-NEXT:  .LBB40_2: ; %endif
2545; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
2546; GFX9-W64-NEXT:    ; return to shader part epilog
2547;
2548; GFX10-W32-LABEL: test_strict_wwm4:
2549; GFX10-W32:       ; %bb.0: ; %main_body
2550; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2551; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2552; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
2553; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2554; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2555; GFX10-W32-NEXT:    s_cbranch_execz .LBB40_2
2556; GFX10-W32-NEXT:  ; %bb.1: ; %if
2557; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2558; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2559; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2560; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2561; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
2562; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2563; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2564; GFX10-W32-NEXT:  .LBB40_2: ; %endif
2565; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2566; GFX10-W32-NEXT:    ; return to shader part epilog
2567main_body:
2568  ; use mbcnt to make sure the branch is divergent
2569  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2570  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2571  %cc = icmp uge i32 %hi, 32
2572  br i1 %cc, label %endif, label %if
2573
2574if:
2575  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
2576  %out = fadd float %src, %src
  ; The strict.wwm result goes straight into the phi below. In the expected
  ; code above it is computed in v1 and copied to v0 with a separate v_mov
  ; after exec has been restored, rather than being written to v0 directly.
2577  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2578  br label %endif
2579
2580endif:
  ; 0.0 is returned when the %if block was skipped.
2581  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2582  ret float %out.1
2583}
2584
2585; Make sure the transition from Exact to WWM then WQM works properly.
2586define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
2587; GFX9-W64-LABEL: test_strict_wwm5:
2588; GFX9-W64:       ; %bb.0: ; %main_body
2589; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2590; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
2591; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2592; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2593; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
2594; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2595; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
2596; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2597; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2598; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
2599; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2600; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2601; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2602; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
2603; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
2604; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
2605; GFX9-W64-NEXT:    ; return to shader part epilog
2606;
2607; GFX10-W32-LABEL: test_strict_wwm5:
2608; GFX10-W32:       ; %bb.0: ; %main_body
2609; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
2610; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
2611; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2612; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2613; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
2614; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2615; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2616; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
2617; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2618; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2619; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2620; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
2621; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2622; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2623; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2624; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
2625; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
2626; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
2627; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2628; GFX10-W32-NEXT:    ; return to shader part epilog
2629main_body:
  ; Step 1: a load/store pair with no wqm/wwm intrinsic attached; in the
  ; expected code above the store is emitted outside any s_or_saveexec ... -1
  ; region and before s_wqm.
2630  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
2631  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Step 2: load and add feeding strict.wwm; emitted between
  ; s_or_saveexec ... -1 and the restore of exec in the expected code above.
2632  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
2633  %temp = fadd float %src1, %src1
2634  %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
  ; Step 3: add feeding the wqm intrinsic; emitted after s_wqm and before the
  ; final AND of exec with the saved mask in the expected code above.
2635  %out = fadd float %temp.0, %temp.0
2636  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
2637  ret float %out.0
2638}
2639
2640; Check that WWM is turned on correctly across basic block boundaries.
2641; if..then..endif version
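2642; The hand-written checks that stood here used the SI-CHECK and VI-CHECK
2643; prefixes, which the RUN lines at the top of this file do not enable, so they
2644; were never evaluated. They expected two loads (buffer_load_dword on SI,
2645; flat_load_dword on VI); the expected output below has global_load_dword.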
2646define amdgpu_ps float @test_strict_wwm6_then() {
2647; GFX9-W64-LABEL: test_strict_wwm6_then:
2648; GFX9-W64:       ; %bb.0: ; %main_body
2649; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2650; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
2651; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2652; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2653; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2654; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2655; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v0
2656; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2657; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2658; GFX9-W64-NEXT:    s_cbranch_execz .LBB42_2
2659; GFX9-W64-NEXT:  ; %bb.1: ; %if
2660; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2661; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
2662; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2663; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
2664; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2665; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2666; GFX9-W64-NEXT:  .LBB42_2: ; %endif
2667; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2668; GFX9-W64-NEXT:    ; return to shader part epilog
2669;
2670; GFX10-W32-LABEL: test_strict_wwm6_then:
2671; GFX10-W32:       ; %bb.0: ; %main_body
2672; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2673; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
2674; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2675; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2676; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2677; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2678; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 32, v0
2679; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2680; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2681; GFX10-W32-NEXT:    s_cbranch_execz .LBB42_2
2682; GFX10-W32-NEXT:  ; %bb.1: ; %if
2683; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2684; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
2685; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2686; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
2687; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2688; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2689; GFX10-W32-NEXT:  .LBB42_2: ; %endif
2690; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2691; GFX10-W32-NEXT:    ; return to shader part epilog
2692main_body:
  ; %src0 is loaded here but only consumed in %if, by the add that feeds
  ; strict.wwm. In the expected code above this load already sits inside its
  ; own s_or_saveexec ... -1 region in the entry block.
2693  %src0 = load volatile float, float addrspace(1)* undef
2694  ; use mbcnt to make sure the branch is divergent
2695  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2696  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2697  %cc = icmp uge i32 %hi, 32
2698  br i1 %cc, label %endif, label %if
2699
2700if:
  ; Second load and the add; in the expected code above both are emitted in a
  ; second s_or_saveexec ... -1 region inside %if.
2701  %src1 = load volatile float, float addrspace(1)* undef
2702  %out = fadd float %src0, %src1
2703  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2704  br label %endif
2705
2706endif:
  ; 0.0 is returned when the %if block was skipped.
2707  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2708  ret float %out.1
2709}
2710
2711; Check that WWM is turned on correctly across basic block boundaries.
2712; loop version
; %src0 is loaded before the loop and consumed by the strict.wwm-marked fadd
; inside it. The expected output wraps the entry-block load and, inside the
; loop, the per-iteration load and the add in "s_or_saveexec ..., -1" /
; exec-restore pairs.
2713define amdgpu_ps float @test_strict_wwm6_loop() {
2714; GFX9-W64-LABEL: test_strict_wwm6_loop:
2715; GFX9-W64:       ; %bb.0: ; %main_body
2716; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2717; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
2718; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2719; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2720; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
2721; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
2722; GFX9-W64-NEXT:  .LBB43_1: ; %loop
2723; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
2724; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2725; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
2726; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2727; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2728; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
2729; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2730; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2731; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
2732; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2733; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2734; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2735; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2736; GFX9-W64-NEXT:    s_cbranch_execnz .LBB43_1
2737; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
2738; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2739; GFX9-W64-NEXT:    ; return to shader part epilog
2740;
2741; GFX10-W32-LABEL: test_strict_wwm6_loop:
2742; GFX10-W32:       ; %bb.0: ; %main_body
2743; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2744; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
2745; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2746; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2747; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v3, -1, 0
2748; GFX10-W32-NEXT:    s_mov_b32 s0, 0
2749; GFX10-W32-NEXT:  .LBB43_1: ; %loop
2750; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
2751; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2752; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
2753; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2754; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2755; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
2756; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2757; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
2758; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2759; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2760; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2761; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
2762; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
2763; GFX10-W32-NEXT:    s_cbranch_execnz .LBB43_1
2764; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
2765; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2766; GFX10-W32-NEXT:    ; return to shader part epilog
2767main_body:
2768  %src0 = load volatile float, float addrspace(1)* undef
2769  ; use mbcnt to make sure the branch is divergent
2770  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  ; NOTE(review): %hi has no users; the loop counter below is seeded from %lo,
  ; and the expected output accordingly contains only v_mbcnt_lo.
2771  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2772  br label %loop
2773
2774loop:
  ; Per-lane trip count: counts down from %lo until it reaches zero.
2775  %counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
2776  %src1 = load volatile float, float addrspace(1)* undef
2777  %out = fadd float %src0, %src1
2778  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2779  %counter.1 = sub i32 %counter, 1
2780  %cc = icmp ne i32 %counter.1, 0
2781  br i1 %cc, label %loop, label %endloop
2782
2783endloop:
  ; The value from the last executed iteration is returned.
2784  ret float %out.0
2785}
2786
2787; Check that @llvm.amdgcn.set.inactive disables WWM.
; In the expected output the set.inactive call appears as
; "s_not exec; v_mov 0; s_not exec" placed before the "s_or_saveexec ..., -1"
; region, and only the add that feeds strict.wwm is inside that region.
2788define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
2789; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
2790; GFX9-W64:       ; %bb.0: ; %main_body
2791; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2792; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
2793; GFX9-W64-NEXT:    s_not_b64 exec, exec
2794; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2795; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2796; GFX9-W64-NEXT:    s_not_b64 exec, exec
2797; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2798; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
2799; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2800; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2801; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
2802; GFX9-W64-NEXT:    s_endpgm
2803;
2804; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
2805; GFX10-W32:       ; %bb.0: ; %main_body
2806; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2807; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
2808; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2809; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2810; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2811; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2812; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2813; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
2814; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2815; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2816; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
2817; GFX10-W32-NEXT:    s_endpgm
2818main_body:
  ; The buffer resource is undef; only the exec-mask handling is under test.
2819  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
2820  %src.0 = bitcast float %src to i32
  ; Second operand (0) is the value written by the v_mov between the two s_not
  ; instructions in the expected output.
2821  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
2822  %out = add i32 %src.1, %src.1
2823  %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2824  %out.1 = bitcast i32 %out.0 to float
2825  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
2826  ret void
2827}
2828
2829; Check a case of a block being entirely WQM except for a bit of WWM.
2830; There was a bug where it forgot to enter and leave WWM.
; The expected output enters WQM once at function entry (s_wqm), and inside %IF
; wraps only the ds_swizzle in an "s_or_saveexec ..., -1" / exec-restore pair.
; NOTE(review): the %data argument is not referenced by the body.
2831define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2832; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
2833; GFX9-W64:       ; %bb.0: ; %main_body
2834; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2835; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2836; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2837; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
2838; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2839; GFX9-W64-NEXT:    s_cbranch_execz .LBB45_2
2840; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2841; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2842; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2843; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2844; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2845; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v0
2846; GFX9-W64-NEXT:    s_not_b64 exec, exec
2847; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 0
2848; GFX9-W64-NEXT:    s_not_b64 exec, exec
2849; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2850; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2851; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2852; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2853; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2854; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v0
2855; GFX9-W64-NEXT:  .LBB45_2: ; %ENDIF
2856; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2857; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2858; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2859; GFX9-W64-NEXT:    ; return to shader part epilog
2860;
2861; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
2862; GFX10-W32:       ; %bb.0: ; %main_body
2863; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2864; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2865; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2866; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
2867; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
2868; GFX10-W32-NEXT:    s_cbranch_execz .LBB45_2
2869; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2870; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2871; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2872; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2873; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2874; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v0
2875; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2876; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 0
2877; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2878; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2879; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2880; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2881; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2882; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2883; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v0
2884; GFX10-W32-NEXT:  .LBB45_2: ; %ENDIF
2885; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2886; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2887; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2888; GFX10-W32-NEXT:    ; return to shader part epilog
2889main_body:
2890  %c.bc = bitcast i32 %c to float
  ; Two dependent samples: the first sample's x component is the coordinate of
  ; the second.
2891  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2892  %tex0 = extractelement <4 x float> %tex, i32 0
2893  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2894  %cmp = icmp eq i32 %z, 0
2895  br i1 %cmp, label %IF, label %ENDIF
2896
2897IF:
2898  %dataf = extractelement <4 x float> %dtex, i32 0
2899  %data1 = fptosi float %dataf to i32
2900  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  ; The offset 2079 is printed as swizzle(SWAP,2) in the expected output.
2901  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2902  %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
2903  %data4f = sitofp i32 %data4 to float
2904  br label %ENDIF
2905
2906ENDIF:
2907  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2908  ret float %r
2909}
2910
2911; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
; Same control flow as a plain if/endif around two dependent image samples, but
; the swizzled value is passed to strict.wqm and there is no set.inactive call.
; In the expected output the strict region in %IF is an exec save followed by
; s_wqm and a later exec restore, rather than "s_or_saveexec ..., -1".
; NOTE(review): the %data argument is not referenced by the body.
2912define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2913; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
2914; GFX9-W64:       ; %bb.0: ; %main_body
2915; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2916; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2917; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
2918; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2919; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2920; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
2921; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2922; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2923; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2924; GFX9-W64-NEXT:    s_cbranch_execz .LBB46_2
2925; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2926; GFX9-W64-NEXT:    s_mov_b64 s[16:17], exec
2927; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2928; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2929; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2930; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2931; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2932; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
2933; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2934; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
2935; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2936; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2937; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
2938; GFX9-W64-NEXT:  .LBB46_2: ; %ENDIF
2939; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2940; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2941; GFX9-W64-NEXT:    ; return to shader part epilog
2942;
2943; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
2944; GFX10-W32:       ; %bb.0: ; %main_body
2945; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2946; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2947; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
2948; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2949; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2950; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
2951; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2952; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2953; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
2954; GFX10-W32-NEXT:    s_cbranch_execz .LBB46_2
2955; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2956; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo
2957; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2958; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2959; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2960; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2961; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2962; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
2963; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2964; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
2965; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2966; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2967; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
2968; GFX10-W32-NEXT:  .LBB46_2: ; %ENDIF
2969; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2970; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2971; GFX10-W32-NEXT:    ; return to shader part epilog
2972main_body:
2973  %c.bc = bitcast i32 %c to float
  ; Two dependent samples: the first sample's x component is the coordinate of
  ; the second. In the expected output both samples are emitted in %IF.
2974  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2975  %tex0 = extractelement <4 x float> %tex, i32 0
2976  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2977  %cmp = icmp eq i32 %z, 0
2978  br i1 %cmp, label %IF, label %ENDIF
2979
2980IF:
2981  %dataf = extractelement <4 x float> %dtex, i32 0
2982  %data1 = fptosi float %dataf to i32
  ; The offset 2079 is printed as swizzle(SWAP,2) in the expected output.
2983  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
2984  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
2985  %data3f = sitofp i32 %data3 to float
2986  br label %ENDIF
2987
2988ENDIF:
2989  %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
2990  ret float %r
2991}
2992
2993;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
; Straight-line sequence mixing three modes: a strict.wqm value (%temp2), then a
; strict.wwm value (%temp4), then an image sample whose coordinate (%temp5)
; depends on both, followed by a store and a load.
; The transition the TODO refers to is visible in the expected output as an
; exec restore immediately followed by another s_wqm, just before the
; v_mov/v_add sequence that computes the sample coordinate.
2994define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) {
2995; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
2996; GFX9-W64:       ; %bb.0: ; %main_body
2997; GFX9-W64-NEXT:    s_mov_b64 s[28:29], exec
2998; GFX9-W64-NEXT:    s_mov_b32 s19, s17
2999; GFX9-W64-NEXT:    s_mov_b64 s[30:31], exec
3000; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3001; GFX9-W64-NEXT:    s_mov_b32 s23, s5
3002; GFX9-W64-NEXT:    s_mov_b32 s22, s4
3003; GFX9-W64-NEXT:    s_mov_b32 s21, s3
3004; GFX9-W64-NEXT:    s_mov_b32 s20, s2
3005; GFX9-W64-NEXT:    s_mov_b32 s27, s9
3006; GFX9-W64-NEXT:    s_mov_b32 s26, s8
3007; GFX9-W64-NEXT:    s_mov_b32 s25, s7
3008; GFX9-W64-NEXT:    s_mov_b32 s24, s6
3009; GFX9-W64-NEXT:    s_mov_b32 s18, s16
3010; GFX9-W64-NEXT:    s_mov_b32 s17, s15
3011; GFX9-W64-NEXT:    s_mov_b32 s16, s14
3012; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3013; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3014; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3015; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3016; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
3017; GFX9-W64-NEXT:    s_mov_b64 exec, s[30:31]
3018; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3019; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
3020; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3021; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
3022; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
3023; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
3024; GFX9-W64-NEXT:    v_mov_b32_e32 v3, s0
3025; GFX9-W64-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
3026; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
3027; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
3028; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3029; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3030; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
3031; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3032; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3033; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
3034; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3035; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3036; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
3037; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
3038; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[28:29]
3039; GFX9-W64-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
3040; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3041; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3042; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
3043; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3044; GFX9-W64-NEXT:    ; return to shader part epilog
3045;
3046; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
3047; GFX10-W32:       ; %bb.0: ; %main_body
3048; GFX10-W32-NEXT:    s_mov_b32 s28, exec_lo
3049; GFX10-W32-NEXT:    s_mov_b32 s19, s17
3050; GFX10-W32-NEXT:    s_mov_b32 s29, exec_lo
3051; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3052; GFX10-W32-NEXT:    s_mov_b32 s23, s5
3053; GFX10-W32-NEXT:    s_mov_b32 s22, s4
3054; GFX10-W32-NEXT:    s_mov_b32 s21, s3
3055; GFX10-W32-NEXT:    s_mov_b32 s20, s2
3056; GFX10-W32-NEXT:    s_mov_b32 s27, s9
3057; GFX10-W32-NEXT:    s_mov_b32 s26, s8
3058; GFX10-W32-NEXT:    s_mov_b32 s25, s7
3059; GFX10-W32-NEXT:    s_mov_b32 s24, s6
3060; GFX10-W32-NEXT:    s_mov_b32 s18, s16
3061; GFX10-W32-NEXT:    s_mov_b32 s17, s15
3062; GFX10-W32-NEXT:    s_mov_b32 s16, s14
3063; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3064; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3065; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3066; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3067; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
3068; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s29
3069; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3070; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
3071; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3072; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
3073; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
3074; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
3075; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s0
3076; GFX10-W32-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
3077; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
3078; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3079; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3080; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3081; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
3082; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3083; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3084; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
3085; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3086; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
3087; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3088; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
3089; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
3090; GFX10-W32-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
3091; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3092; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
3093; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
3094; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3095; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
3096; GFX10-W32-NEXT:    ; return to shader part epilog
3097main_body:
3098  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  ; Strict WQM part: %reload and the fadd producing %temp feed strict.wqm.
3099  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3100  %temp = fadd float %reload, %reload
3101  %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
3102  %temp3 = fadd float %temp2, %temp2
  ; Strict WWM part: a load from the second resource (%res2) at %idx0.
3103  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0)
3104  %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
  ; Sample coordinate combines both strict results.
3105  %temp5 = fadd float %temp3, %temp4
3106  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0)
3107  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3108  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3109  ret float %out
3110}
3111
; Straight-line sequence mixing three modes in the order strict WWM (%temp2),
; strict WQM (%temp4), then an image sample whose coordinate (%temp5) depends
; on both, followed by a store and a load.
; In the expected output the first load and its add are wrapped in
; "s_or_saveexec ..., -1" regions, and the second load in an exec save + s_wqm
; region.
3112define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
3113; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
3114; GFX9-W64:       ; %bb.0: ; %main_body
3115; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
3116; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3117; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
3118; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3119; GFX9-W64-NEXT:    s_mov_b32 s19, s5
3120; GFX9-W64-NEXT:    s_mov_b32 s18, s4
3121; GFX9-W64-NEXT:    s_mov_b32 s17, s3
3122; GFX9-W64-NEXT:    s_mov_b32 s16, s2
3123; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3124; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3125; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3126; GFX9-W64-NEXT:    s_mov_b32 s11, s9
3127; GFX9-W64-NEXT:    s_mov_b32 s10, s8
3128; GFX9-W64-NEXT:    s_mov_b32 s9, s7
3129; GFX9-W64-NEXT:    s_mov_b32 s8, s6
3130; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
3131; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
3132; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3133; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
3134; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
3135; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
3136; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
3137; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
3138; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3139; GFX9-W64-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
3140; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3141; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
3142; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3143; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
3144; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3145; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3146; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
3147; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3148; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3149; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
3150; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
3151; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
3152; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3153; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3154; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3155; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3156; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3157; GFX9-W64-NEXT:    ; return to shader part epilog
3158;
3159; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
3160; GFX10-W32:       ; %bb.0: ; %main_body
3161; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
3162; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3163; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
3164; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3165; GFX10-W32-NEXT:    s_mov_b32 s19, s5
3166; GFX10-W32-NEXT:    s_mov_b32 s18, s4
3167; GFX10-W32-NEXT:    s_mov_b32 s17, s3
3168; GFX10-W32-NEXT:    s_mov_b32 s16, s2
3169; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3170; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3171; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3172; GFX10-W32-NEXT:    s_mov_b32 s11, s9
3173; GFX10-W32-NEXT:    s_mov_b32 s10, s8
3174; GFX10-W32-NEXT:    s_mov_b32 s9, s7
3175; GFX10-W32-NEXT:    s_mov_b32 s8, s6
3176; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
3177; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
3178; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3179; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
3180; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3181; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3182; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3183; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
3184; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3185; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3186; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3187; GFX10-W32-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
3188; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3189; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3190; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3191; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
3192; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3193; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3194; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
3195; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3196; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
3197; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3198; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
3199; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3200; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3201; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3202; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3203; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3204; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3205; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
3206; GFX10-W32-NEXT:    ; return to shader part epilog
3207main_body:
3208  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ; Strict WWM part: %reload (at %idx1) and the fadd producing %temp feed
  ; strict.wwm.
3209  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3210  %temp = fadd float %reload, %reload
3211  %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
3212  %temp3 = fadd float %temp2, %temp2
  ; NOTE(review): despite its name, %reload_wwm feeds strict.wqm in this test.
3213  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3214  %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  ; Sample coordinate combines both strict results.
3215  %temp5 = fadd float %temp3, %temp4
3216  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
3217  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3218  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3219  ret float %out
3220}
3221
3222;TODO: WQM -> StrictWQM transition could be improved. StrictWQM could use the exec from the previous state instead of calling s_wqm again.
; Sequence: store, a load + fadd feeding a first image sample, a second load
; passed to strict.wqm, then a second image sample whose coordinate (%temp4)
; combines the first sample result with the strict.wqm value.
; The transition the TODO refers to is visible in the expected output as an
; s_wqm before the first load followed by an exec save and a second s_wqm
; before the second load.
3223define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
3224; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
3225; GFX9-W64:       ; %bb.0: ; %main_body
3226; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
3227; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3228; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
3229; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3230; GFX9-W64-NEXT:    s_mov_b32 s19, s5
3231; GFX9-W64-NEXT:    s_mov_b32 s18, s4
3232; GFX9-W64-NEXT:    s_mov_b32 s17, s3
3233; GFX9-W64-NEXT:    s_mov_b32 s16, s2
3234; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3235; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3236; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3237; GFX9-W64-NEXT:    s_mov_b32 s11, s9
3238; GFX9-W64-NEXT:    s_mov_b32 s10, s8
3239; GFX9-W64-NEXT:    s_mov_b32 s9, s7
3240; GFX9-W64-NEXT:    s_mov_b32 s8, s6
3241; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
3242; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
3243; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3244; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3245; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
3246; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
3247; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
3248; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3249; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
3250; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3251; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3252; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3253; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3254; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3255; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
3256; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3257; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3258; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v3
3259; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
3260; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3261; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3262; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3263; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3264; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3265; GFX9-W64-NEXT:    ; return to shader part epilog
3266;
3267; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
3268; GFX10-W32:       ; %bb.0: ; %main_body
3269; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
3270; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3271; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
3272; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3273; GFX10-W32-NEXT:    s_mov_b32 s19, s5
3274; GFX10-W32-NEXT:    s_mov_b32 s18, s4
3275; GFX10-W32-NEXT:    s_mov_b32 s17, s3
3276; GFX10-W32-NEXT:    s_mov_b32 s16, s2
3277; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3278; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3279; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3280; GFX10-W32-NEXT:    s_mov_b32 s11, s9
3281; GFX10-W32-NEXT:    s_mov_b32 s10, s8
3282; GFX10-W32-NEXT:    s_mov_b32 s9, s7
3283; GFX10-W32-NEXT:    s_mov_b32 s8, s6
3284; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
3285; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
3286; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3287; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s1
3288; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3289; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3290; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3291; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
3292; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3293; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3294; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
3295; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3296; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3297; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3298; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3299; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
3300; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3301; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3302; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3303; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v3
3304; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3305; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3306; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3307; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3308; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3309; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3310; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
3311; GFX10-W32-NEXT:    ; return to shader part epilog
3312main_body:
3313  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ; First WQM part: %reload (at %idx1) and %temp feed the first image sample.
3314  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3315  %temp = fadd float %reload, %reload
3316  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
3317  %temp2 = fadd float %tex, %tex
  ; NOTE(review): despite its name, %reload_wwm feeds strict.wqm in this test.
3318  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3319  %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  ; Second sample coordinate combines the first sample and the strict.wqm value.
3320  %temp4 = fadd float %temp2, %temp3
3321  %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
3322  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3323  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3324  ret float %out
3325}
3326
; Declarations of the llvm.amdgcn.* intrinsics. The trailing #N on a
; declaration selects one of the attribute groups defined at the end of the
; file.
;
; Export and 1D image store (attribute group #1).
3327declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
3328declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
3329
; Buffer stores (group #2) and buffer loads (group #3), in both the raw and
; struct forms; the struct forms take one more i32 operand than the raw forms.
; NOTE(review): the store declarations reference #2, which is defined below as
; "nounwind readonly" -- unusual for a store; confirm this is intentional.
3330declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
3331declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
3332declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
3333declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
3334declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
3335declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
3336
; Image load/sample variants, kill, the wqm / wwm / strict.wqm / strict.wwm
; intrinsics (f32 and i32 forms), set.inactive, mbcnt, cvt.pkrtz, compressed
; export and interp. ds.swizzle is the only declaration without an attribute
; group.
3337declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
3338declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3339declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3340declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3341declare void @llvm.amdgcn.kill(i1) #1
3342declare float @llvm.amdgcn.wqm.f32(float) #3
3343declare i32 @llvm.amdgcn.wqm.i32(i32) #3
3344declare float @llvm.amdgcn.strict.wwm.f32(float) #3
3345declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
3346declare float @llvm.amdgcn.wwm.f32(float) #3
3347declare i32 @llvm.amdgcn.wwm.i32(i32) #3
3348declare float @llvm.amdgcn.strict.wqm.f32(float) #3
3349declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
3350declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
3351declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
3352declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
3353declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
3354declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
3355declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
3356declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
3357declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
3358
; Attribute groups referenced as #N by the definitions and declarations in
; this file.
; #1: nounwind only -- used by the exp, exp.compr, image.store and kill
;     declarations.
3359attributes #1 = { nounwind }
; #2: nounwind + readonly -- used by the buffer store and interp.p1/p2
;     declarations.
3360attributes #2 = { nounwind readonly }
; #3: nounwind + readnone -- used by the buffer/image load, image sample,
;     wqm/wwm, mbcnt and cvt.pkrtz declarations.
3361attributes #3 = { nounwind readnone }
; #4: as #3 plus convergent -- used by the set.inactive declaration.
3362attributes #4 = { nounwind readnone convergent }
; #5: carries the "amdgpu-ps-wqm-outputs" string attribute; presumably attached
;     to test functions defined earlier in the file -- TODO confirm.
3363attributes #5 = { "amdgpu-ps-wqm-outputs" }
; #6: nounwind plus "InitialPSInputAddr"="2" -- used by the @test2 definition.
3364attributes #6 = { nounwind "InitialPSInputAddr"="2" }
3365