1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
3; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s
4
5; Check that WQM isn't triggered by image load/store intrinsics.
; The expected output for both targets contains no s_wqm instruction: the
; loaded value is stored back through the same resource and also returned.
6define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
7; GFX9-W64-LABEL: test1:
8; GFX9-W64:       ; %bb.0: ; %main_body
9; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
10; GFX9-W64-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm
11; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
12; GFX9-W64-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf unorm
13; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
14; GFX9-W64-NEXT:    ; return to shader part epilog
15;
16; GFX10-W32-LABEL: test1:
17; GFX10-W32:       ; %bb.0: ; %main_body
18; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
19; GFX10-W32-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
20; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
21; GFX10-W32-NEXT:    image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
22; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
23; GFX10-W32-NEXT:    ; return to shader part epilog
24main_body:
  ; 1D image load at coordinate %c (i32 15 shows up as dmask:0xf in the
  ; expected assembly), followed by a store of the same value at the same
  ; coordinate.
25  %tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
26  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
27  ret <4 x float> %tex
28}
29
30; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
; Expected: exec is saved and s_wqm is applied ahead of the v_interp
; instructions; the saved exec is ANDed back just before image_sample.
31define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
32; GFX9-W64-LABEL: test2:
33; GFX9-W64:       ; %bb.0: ; %main_body
34; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
35; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
36; GFX9-W64-NEXT:    s_mov_b32 m0, s3
37; GFX9-W64-NEXT:    s_nop 0
38; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
39; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
40; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
41; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
42; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
43; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
44; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
45; GFX9-W64-NEXT:    ; return to shader part epilog
46;
47; GFX10-W32-LABEL: test2:
48; GFX10-W32:       ; %bb.0: ; %main_body
49; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
50; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
51; GFX10-W32-NEXT:    s_mov_b32 m0, s3
52; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
53; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
54; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
55; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
56; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
57; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
58; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
59; GFX10-W32-NEXT:    ; return to shader part epilog
60main_body:
  ; Two interp.p1/interp.p2 pairs driven by the components of %pos; their
  ; results (%inst26, %inst29) are the coordinates of the 2D sample below.
61  %inst23 = extractelement <2 x float> %pos, i32 0
62  %inst24 = extractelement <2 x float> %pos, i32 1
63  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
64  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
65  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
66  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
67  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
68  ret <4 x float> %tex
69}
70
71; ... but disabled for stores (and, in this simple case, not re-enabled) ...
; Expected: s_wqm is immediately followed by the s_and that restores the saved
; exec, ahead of both image_sample and the buffer store.
72define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
73; GFX9-W64-LABEL: test3:
74; GFX9-W64:       ; %bb.0: ; %main_body
75; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
76; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
77; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
78; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
79; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
80; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
81; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
82; GFX9-W64-NEXT:    ; return to shader part epilog
83;
84; GFX10-W32-LABEL: test3:
85; GFX10-W32:       ; %bb.0: ; %main_body
86; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
87; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
88; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
89; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
90; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
91; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
92; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
93; GFX10-W32-NEXT:    ; return to shader part epilog
94main_body:
95  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; Component 0 of the sample result, reinterpreted as i32, is passed as the
  ; index operand of the store below.
96  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
97  %tex.2 = extractelement <4 x i32> %tex.1, i32 0
98
  ; The store writes the sampled vector; its resource operand is undef.
99  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
100
101  ret <4 x float> %tex
102}
103
104; ... and disabled for export.
; Same interpolation + sample sequence as test2, but the result is exported
; instead of returned. Expected: the saved exec is restored before
; image_sample, and the exp instruction follows with no further exec changes.
105define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
106; GFX9-W64-LABEL: test3x:
107; GFX9-W64:       ; %bb.0: ; %main_body
108; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
109; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
110; GFX9-W64-NEXT:    s_mov_b32 m0, s3
111; GFX9-W64-NEXT:    s_nop 0
112; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
113; GFX9-W64-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
114; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
115; GFX9-W64-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
116; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
117; GFX9-W64-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
118; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
119; GFX9-W64-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
120; GFX9-W64-NEXT:    s_endpgm
121;
122; GFX10-W32-LABEL: test3x:
123; GFX10-W32:       ; %bb.0: ; %main_body
124; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
125; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
126; GFX10-W32-NEXT:    s_mov_b32 m0, s3
127; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v2, v0, attr0.x
128; GFX10-W32-NEXT:    v_interp_p1_f32_e32 v3, v0, attr0.y
129; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v2, v1, attr0.x
130; GFX10-W32-NEXT:    v_interp_p2_f32_e32 v3, v1, attr0.y
131; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
132; GFX10-W32-NEXT:    image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
133; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
134; GFX10-W32-NEXT:    exp mrt0 v0, v1, v2, v3 done vm
135; GFX10-W32-NEXT:    s_endpgm
136main_body:
  ; Interpolated coordinates for the 2D sample.
137  %inst23 = extractelement <2 x float> %pos, i32 0
138  %inst24 = extractelement <2 x float> %pos, i32 1
139  %inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
140  %inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
141  %inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
142  %inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
143  %tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; Split the sample result and export all four components (the expected
  ; assembly shows this as "exp mrt0 ... done vm").
144  %tex.0 = extractelement <4 x float> %tex, i32 0
145  %tex.1 = extractelement <4 x float> %tex, i32 1
146  %tex.2 = extractelement <4 x float> %tex, i32 2
147  %tex.3 = extractelement <4 x float> %tex, i32 3
148  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
149  ret void
150}
151
152; Check that WQM is re-enabled when required.
; Expected: the multiply and the first image_sample are emitted after s_wqm;
; the saved exec is restored before the second image_sample and the buffer
; store.
; Note: the %ptr and %data arguments are not referenced in the body.
153define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) {
154; GFX9-W64-LABEL: test4:
155; GFX9-W64:       ; %bb.0: ; %main_body
156; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
157; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
158; GFX9-W64-NEXT:    v_mul_lo_u32 v4, v0, v1
159; GFX9-W64-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
160; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
161; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
162; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
163; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
164; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
165; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
166; GFX9-W64-NEXT:    ; return to shader part epilog
167;
168; GFX10-W32-LABEL: test4:
169; GFX10-W32:       ; %bb.0: ; %main_body
170; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
171; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
172; GFX10-W32-NEXT:    v_mul_lo_u32 v4, v0, v1
173; GFX10-W32-NEXT:    image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
174; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
175; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
176; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
177; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
178; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
179; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
180; GFX10-W32-NEXT:    ; return to shader part epilog
181main_body:
  ; %c.1 is used both as the store index and (bitcast to float) as the
  ; coordinate of the first sample.
182  %c.1 = mul i32 %c, %d
183
  ; Store with undef data and undef resource, indexed by %c.1.
184  call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
185  %c.1.bc = bitcast i32 %c.1 to float
186  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; Dependent sample: component 0 of the first result is the coordinate of
  ; the second sample.
187  %tex0 = extractelement <4 x float> %tex, i32 0
188  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
189  ret <4 x float> %dtex
190}
191
192; Check that WQM is triggered by the wqm intrinsic.
193; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
194; does not happen - the v_add should write the return reg directly.
; Expected: v_add_f32 writes v0 and no v_mov appears between it and the s_and
; that restores exec.
195define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
196; GFX9-W64-LABEL: test5:
197; GFX9-W64:       ; %bb.0: ; %main_body
198; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
199; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
200; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
201; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
202; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
203; GFX9-W64-NEXT:    s_nop 0
204; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
205; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
206; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
207; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
208; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
209; GFX9-W64-NEXT:    ; return to shader part epilog
210;
211; GFX10-W32-LABEL: test5:
212; GFX10-W32:       ; %bb.0: ; %main_body
213; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
214; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
215; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
216; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
217; GFX10-W32-NEXT:    s_clause 0x1
218; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
219; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
220; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
221; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
222; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
223; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
224; GFX10-W32-NEXT:    ; return to shader part epilog
225main_body:
  ; Two indexed buffer loads (undef resource) summed; the sum is passed through
  ; llvm.amdgcn.wqm.f32 and returned.
226  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
227  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
228  %out = fadd float %src0, %src1
229  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
230  ret float %out.0
231}
232
233; Check that the wqm intrinsic works correctly for integers.
; The float sum is bitcast to i32, passed through llvm.amdgcn.wqm.i32, and
; bitcast back; the expected assembly is the same instruction sequence as test5.
234define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
235; GFX9-W64-LABEL: test6:
236; GFX9-W64:       ; %bb.0: ; %main_body
237; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
238; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
239; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
240; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
241; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
242; GFX9-W64-NEXT:    s_nop 0
243; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
244; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
245; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
246; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
247; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
248; GFX9-W64-NEXT:    ; return to shader part epilog
249;
250; GFX10-W32-LABEL: test6:
251; GFX10-W32:       ; %bb.0: ; %main_body
252; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
253; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
254; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
255; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
256; GFX10-W32-NEXT:    s_clause 0x1
257; GFX10-W32-NEXT:    buffer_load_dword v0, v0, s[0:3], 0 idxen
258; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
259; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
260; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
261; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
262; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
263; GFX10-W32-NEXT:    ; return to shader part epilog
264main_body:
265  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
266  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
267  %out = fadd float %src0, %src1
  ; Round-trip through i32 so that the i32 overload of the intrinsic is used.
268  %out.0 = bitcast float %out to i32
269  %out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
270  %out.2 = bitcast i32 %out.1 to float
271  ret float %out.2
272}
273
274; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
275
276; Check that WWM is triggered by the wwm intrinsic.
; Expected: s_or_saveexec with -1 precedes the loads and the add; the saved
; exec is restored afterwards, and only then is the result copied into v0.
277define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
278; GFX9-W64-LABEL: test_wwm1:
279; GFX9-W64:       ; %bb.0: ; %main_body
280; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
281; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
282; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
283; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
284; GFX9-W64-NEXT:    s_nop 0
285; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
286; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
287; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
288; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
289; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
290; GFX9-W64-NEXT:    ; return to shader part epilog
291;
292; GFX10-W32-LABEL: test_wwm1:
293; GFX10-W32:       ; %bb.0: ; %main_body
294; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
295; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
296; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
297; GFX10-W32-NEXT:    s_clause 0x1
298; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
299; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
300; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
301; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
302; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
303; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
304; GFX10-W32-NEXT:    ; return to shader part epilog
305main_body:
  ; Two indexed buffer loads (undef resource) summed; the sum is passed through
  ; llvm.amdgcn.wwm.f32 and returned.
306  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
307  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
308  %out = fadd float %src0, %src1
309  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
310  ret float %out.0
311}
312
313; Same as test_wwm1, but with an integer type: the loaded values are bitcast
; to i32, added with an integer add, and passed through llvm.amdgcn.wwm.i32.
314define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
315; GFX9-W64-LABEL: test_wwm2:
316; GFX9-W64:       ; %bb.0: ; %main_body
317; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
318; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
319; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
320; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
321; GFX9-W64-NEXT:    s_nop 0
322; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
323; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
324; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
325; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
326; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
327; GFX9-W64-NEXT:    ; return to shader part epilog
328;
329; GFX10-W32-LABEL: test_wwm2:
330; GFX10-W32:       ; %bb.0: ; %main_body
331; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
332; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
333; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
334; GFX10-W32-NEXT:    s_clause 0x1
335; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
336; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
337; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
338; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
339; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
340; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
341; GFX10-W32-NEXT:    ; return to shader part epilog
342main_body:
343  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
344  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ; Reinterpret the loaded floats as integers so the add and the intrinsic are
  ; both i32.
345  %src0.0 = bitcast float %src0 to i32
346  %src1.0 = bitcast float %src1 to i32
347  %out = add i32 %src0.0, %src1.0
348  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
349  %out.1 = bitcast i32 %out.0 to float
350  ret float %out.1
351}
352
353; Check that we don't leave WWM on for computations that don't require WWM,
354; since that will lead to clobbering things that aren't supposed to be clobbered
355; in cases like this.
356; We enforce this by checking that v_add gets emitted in the same block as
357; WWM computations.
; Expected in the %if block: the load and the first v_add sit between
; s_or_saveexec and the exec restore; the second v_add comes after the restore.
358define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
359; GFX9-W64-LABEL: test_wwm3:
360; GFX9-W64:       ; %bb.0: ; %main_body
361; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
362; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
363; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
364; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
365; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
366; GFX9-W64-NEXT:    s_cbranch_execz .LBB9_2
367; GFX9-W64-NEXT:  ; %bb.1: ; %if
368; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
369; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
370; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
371; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
372; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
373; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
374; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
375; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
376; GFX9-W64-NEXT:  .LBB9_2: ; %endif
377; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
378; GFX9-W64-NEXT:    ; return to shader part epilog
379;
380; GFX10-W32-LABEL: test_wwm3:
381; GFX10-W32:       ; %bb.0: ; %main_body
382; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
383; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
384; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
385; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
386; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
387; GFX10-W32-NEXT:    s_cbranch_execz .LBB9_2
388; GFX10-W32-NEXT:  ; %bb.1: ; %if
389; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
390; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
391; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
392; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
393; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
394; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
395; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
396; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
397; GFX10-W32-NEXT:  .LBB9_2: ; %endif
398; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
399; GFX10-W32-NEXT:    ; return to shader part epilog
400main_body:
401  ; use mbcnt to make sure the branch is divergent
402  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
403  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; Lanes whose mbcnt result is >= 16 skip %if and take the 0.0 phi input.
404  %cc = icmp uge i32 %hi, 16
405  br i1 %cc, label %endif, label %if
406
407if:
408  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ; %out feeds the wwm intrinsic; %out.1 consumes the intrinsic's result
  ; together with %src and is not itself passed to the intrinsic.
409  %out = fadd float %src, %src
410  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
411  %out.1 = fadd float %src, %out.0
412  br label %endif
413
414endif:
415  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
416  ret float %out.2
417}
418
419; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
420; write could clobber disabled channels in the non-WWM one.
421; We enforce this by checking that v_mov gets emitted in the same block as
422; WWM computations.
; Expected in the %if block: the add writes v1 between s_or_saveexec and the
; exec restore, and a separate v_mov into v0 follows the restore.
423define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
424; GFX9-W64-LABEL: test_wwm4:
425; GFX9-W64:       ; %bb.0: ; %main_body
426; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
427; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
428; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
429; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
430; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
431; GFX9-W64-NEXT:    s_cbranch_execz .LBB10_2
432; GFX9-W64-NEXT:  ; %bb.1: ; %if
433; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
434; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
435; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
436; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
437; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
438; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
439; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
440; GFX9-W64-NEXT:  .LBB10_2: ; %endif
441; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
442; GFX9-W64-NEXT:    ; return to shader part epilog
443;
444; GFX10-W32-LABEL: test_wwm4:
445; GFX10-W32:       ; %bb.0: ; %main_body
446; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
447; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
448; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
449; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
450; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
451; GFX10-W32-NEXT:    s_cbranch_execz .LBB10_2
452; GFX10-W32-NEXT:  ; %bb.1: ; %if
453; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
454; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
455; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
456; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
457; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
458; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
459; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
460; GFX10-W32-NEXT:  .LBB10_2: ; %endif
461; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
462; GFX10-W32-NEXT:    ; return to shader part epilog
463main_body:
464  ; use mbcnt to make sure the branch is divergent
465  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
466  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; Lanes whose mbcnt result is >= 16 skip %if and take the 0.0 phi input.
467  %cc = icmp uge i32 %hi, 16
468  br i1 %cc, label %endif, label %if
469
470if:
471  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
472  %out = fadd float %src, %src
473  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
474  br label %endif
475
476endif:
  ; The wwm result merges with a constant coming from the other predecessor.
477  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
478  ret float %out.1
479}
480
481; Make sure the transition from Exact to WWM then WQM works properly.
; The body does a plain load/store first, then computes a value passed through
; the wwm intrinsic, then a value derived from it passed through the wqm
; intrinsic. In the expected assembly, the restore of exec after the
; s_or_saveexec region is directly followed by s_wqm.
482define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
483; GFX9-W64-LABEL: test_wwm5:
484; GFX9-W64:       ; %bb.0: ; %main_body
485; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
486; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
487; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
488; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
489; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
490; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
491; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
492; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
493; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
494; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
495; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
496; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
497; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
498; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
499; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
500; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
501; GFX9-W64-NEXT:    ; return to shader part epilog
502;
503; GFX10-W32-LABEL: test_wwm5:
504; GFX10-W32:       ; %bb.0: ; %main_body
505; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
506; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
507; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
508; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
509; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
510; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
511; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
512; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
513; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
514; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
515; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
516; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
517; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
518; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
519; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
520; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
521; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
522; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
523; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
524; GFX10-W32-NEXT:    ; return to shader part epilog
525main_body:
  ; Plain load and store at %idx0 (neither value goes through wwm or wqm).
526  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
527  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Value passed through the wwm intrinsic.
528  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
529  %temp = fadd float %src1, %src1
530  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
  ; Value derived from the wwm result and passed through the wqm intrinsic.
531  %out = fadd float %temp.0, %temp.0
532  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
533  ret float %out.0
534}
535
536; Check that WWM is turned on correctly across basic block boundaries.
537; if..then..endif version
; %src0 is loaded in %main_body and %src1 in %if; only their sum is passed to
; the wwm intrinsic. In the expected assembly each of the two loads sits in its
; own s_or_saveexec region, one per block.
; NOTE(review): the SI-CHECK / VI-CHECK lines below use prefixes that are not
; enabled by the RUN lines at the top of this file, so they look inactive -
; confirm before relying on them.
538;SI-CHECK: buffer_load_dword
539;VI-CHECK: flat_load_dword
540;SI-CHECK: buffer_load_dword
541;VI-CHECK: flat_load_dword
542define amdgpu_ps float @test_wwm6_then() {
543; GFX9-W64-LABEL: test_wwm6_then:
544; GFX9-W64:       ; %bb.0: ; %main_body
545; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
546; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
547; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
548; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
549; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
550; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
551; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
552; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
553; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
554; GFX9-W64-NEXT:    s_cbranch_execz .LBB12_2
555; GFX9-W64-NEXT:  ; %bb.1: ; %if
556; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
557; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
558; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
559; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
560; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
561; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
562; GFX9-W64-NEXT:  .LBB12_2: ; %endif
563; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
564; GFX9-W64-NEXT:    ; return to shader part epilog
565;
566; GFX10-W32-LABEL: test_wwm6_then:
567; GFX10-W32:       ; %bb.0: ; %main_body
568; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
569; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
570; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
571; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
572; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
573; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
574; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
575; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
576; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
577; GFX10-W32-NEXT:    s_cbranch_execz .LBB12_2
578; GFX10-W32-NEXT:  ; %bb.1: ; %if
579; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
580; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
581; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
582; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
583; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
584; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
585; GFX10-W32-NEXT:  .LBB12_2: ; %endif
586; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
587; GFX10-W32-NEXT:    ; return to shader part epilog
588main_body:
  ; First operand of the wwm computation, loaded before the branch.
589  %src0 = load volatile float, float addrspace(1)* undef
590  ; use mbcnt to make sure the branch is divergent
591  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
592  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
593  %cc = icmp uge i32 %hi, 16
594  br i1 %cc, label %endif, label %if
595
596if:
  ; Second operand, loaded inside the conditional block.
597  %src1 = load volatile float, float addrspace(1)* undef
598  %out = fadd float %src0, %src1
599  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
600  br label %endif
601
602endif:
603  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
604  ret float %out.1
605}
606
607; Check that WWM is turned on correctly across basic block boundaries.
608; loop version
; %src0 is loaded in %main_body and %src1 inside %loop; only their sum is
; passed to the wwm intrinsic. In the expected assembly the counter decrement
; is emitted outside the s_or_saveexec regions that hold the loop's load and add.
; NOTE(review): the SI-CHECK / VI-CHECK lines below use prefixes that are not
; enabled by the RUN lines at the top of this file, so they look inactive -
; confirm before relying on them.
609;SI-CHECK: buffer_load_dword
610;VI-CHECK: flat_load_dword
611;SI-CHECK: buffer_load_dword
612;VI-CHECK: flat_load_dword
613define amdgpu_ps float @test_wwm6_loop() {
614; GFX9-W64-LABEL: test_wwm6_loop:
615; GFX9-W64:       ; %bb.0: ; %main_body
616; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
617; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
618; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
619; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
620; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
621; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
622; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
623; GFX9-W64-NEXT:  .LBB13_1: ; %loop
624; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
625; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
626; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
627; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
628; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
629; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
630; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
631; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
632; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
633; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
634; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
635; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
636; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
637; GFX9-W64-NEXT:    s_cbranch_execnz .LBB13_1
638; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
639; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
640; GFX9-W64-NEXT:    ; return to shader part epilog
641;
642; GFX10-W32-LABEL: test_wwm6_loop:
643; GFX10-W32:       ; %bb.0: ; %main_body
644; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
645; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
646; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
647; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
648; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
649; GFX10-W32-NEXT:    s_mov_b32 s0, 0
650; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
651; GFX10-W32-NEXT:  .LBB13_1: ; %loop
652; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
653; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
654; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
655; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
656; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
657; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
658; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
659; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
660; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
661; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
662; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
663; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
664; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
665; GFX10-W32-NEXT:    s_cbranch_execnz .LBB13_1
666; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
667; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
668; GFX10-W32-NEXT:    ; return to shader part epilog
669main_body:
  ; First operand of the wwm computation, loaded before the loop.
670  %src0 = load volatile float, float addrspace(1)* undef
671  ; use mbcnt to make sure the branch is divergent
672  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
673  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
674  br label %loop
675
676loop:
  ; The counter starts at the mbcnt result and is decremented each iteration;
  ; the loop exits when it reaches zero.
677  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
678  %src1 = load volatile float, float addrspace(1)* undef
679  %out = fadd float %src0, %src1
680  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
681  %counter.1 = sub i32 %counter, 1
682  %cc = icmp ne i32 %counter.1, 0
683  br i1 %cc, label %loop, label %endloop
684
685endloop:
  ; Returns the wwm value computed in the final iteration.
686  ret float %out.0
687}
688
689; Check that @llvm.amdgcn.set.inactive disables WWM.
690define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
691; GFX9-W64-LABEL: test_wwm_set_inactive1:
692; GFX9-W64:       ; %bb.0: ; %main_body
693; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
694; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
695; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
696; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
697; GFX9-W64-NEXT:    s_not_b64 exec, exec
698; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
699; GFX9-W64-NEXT:    s_not_b64 exec, exec
700; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
701; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
702; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
703; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
704; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
705; GFX9-W64-NEXT:    s_endpgm
706;
707; GFX10-W32-LABEL: test_wwm_set_inactive1:
708; GFX10-W32:       ; %bb.0: ; %main_body
709; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
710; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
711; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
712; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
713; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
714; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
715; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
716; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
717; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
718; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
719; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
720; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
721; GFX10-W32-NEXT:    s_endpgm
722main_body:
  ; Load one float at index %idx and reinterpret its bits as i32.
723  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
724  %src.0 = bitcast float %src to i32
  ; set.inactive takes %src.0 and the constant 0. In the expected code the copy
  ; of the loaded value comes first, and the write of 0 sits between two s_not
  ; of exec, i.e. it is executed with the exec mask inverted.
725  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  ; The add is the value wrapped by wwm. Its v_add is expected between
  ; s_or_saveexec -1 and the restore of the saved exec mask.
726  %out = add i32 %src.1, %src.1
727  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
728  %out.1 = bitcast i32 %out.0 to float
  ; Store the result back at the same index %idx, after exec has been restored.
729  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
730  ret void
731}
732
733; Check that Strict WQM is triggered by the strict_wqm intrinsic.
734define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
735; GFX9-W64-LABEL: test_strict_wqm1:
736; GFX9-W64:       ; %bb.0: ; %main_body
737; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
738; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
739; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
740; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
741; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
742; GFX9-W64-NEXT:    s_nop 0
743; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
744; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
745; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
746; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
747; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
748; GFX9-W64-NEXT:    ; return to shader part epilog
749;
750; GFX10-W32-LABEL: test_strict_wqm1:
751; GFX10-W32:       ; %bb.0: ; %main_body
752; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
753; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
754; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
755; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
756; GFX10-W32-NEXT:    s_clause 0x1
757; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
758; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
759; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
760; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
761; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
762; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
763; GFX10-W32-NEXT:    ; return to shader part epilog
764main_body:
  ; Two struct buffer loads, indexed by %idx0 and %idx1, supply the operands.
765  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
766  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ; The sum is the value passed to strict.wqm. The expected code saves exec,
  ; applies s_wqm, and restores the saved mask only after the v_add_f32; the
  ; copy into the return register v0 follows the restore.
767  %out = fadd float %src0, %src1
768  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
769  ret float %out.0
770}
771
772; Same as above, but with an integer type.
773define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
774; GFX9-W64-LABEL: test_strict_wqm2:
775; GFX9-W64:       ; %bb.0: ; %main_body
776; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
777; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
778; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
779; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
780; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
781; GFX9-W64-NEXT:    s_nop 0
782; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
783; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
784; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
785; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
786; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
787; GFX9-W64-NEXT:    ; return to shader part epilog
788;
789; GFX10-W32-LABEL: test_strict_wqm2:
790; GFX10-W32:       ; %bb.0: ; %main_body
791; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
792; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
793; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
794; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
795; GFX10-W32-NEXT:    s_clause 0x1
796; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
797; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
798; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
799; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
800; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
801; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
802; GFX10-W32-NEXT:    ; return to shader part epilog
803main_body:
  ; Two struct buffer loads, indexed by %idx0 and %idx1, supply the operands.
804  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
805  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ; The loaded floats are bitcast to i32 so that the value passed to the i32
  ; variant of strict.wqm comes from an integer add.
806  %src0.0 = bitcast float %src0 to i32
807  %src1.0 = bitcast float %src1 to i32
  ; The expected code places the integer add between s_wqm and the restore of
  ; the saved exec mask.
808  %out = add i32 %src0.0, %src1.0
809  %out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
  ; Bitcast back to float only to match the function's return type.
810  %out.1 = bitcast i32 %out.0 to float
811  ret float %out.1
812}
813
814; Check that we don't leave Strict WQM on for computations that don't require it,
815; since that will lead to clobbering things that aren't supposed to be clobbered
816; in cases like this.
817; We enforce this by checking that v_add gets emitted in the same block as
818; Strict WQM computations.
819define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
820; GFX9-W64-LABEL: test_strict_wqm3:
821; GFX9-W64:       ; %bb.0: ; %main_body
822; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
823; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
824; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
825; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
826; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
827; GFX9-W64-NEXT:    s_cbranch_execz .LBB17_2
828; GFX9-W64-NEXT:  ; %bb.1: ; %if
829; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
830; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
831; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
832; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
833; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
834; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
835; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
836; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
837; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
838; GFX9-W64-NEXT:  .LBB17_2: ; %endif
839; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
840; GFX9-W64-NEXT:    ; return to shader part epilog
841;
842; GFX10-W32-LABEL: test_strict_wqm3:
843; GFX10-W32:       ; %bb.0: ; %main_body
844; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
845; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
846; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
847; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
848; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
849; GFX10-W32-NEXT:    s_cbranch_execz .LBB17_2
850; GFX10-W32-NEXT:  ; %bb.1: ; %if
851; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
852; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
853; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
854; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
855; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
856; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
857; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
858; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
859; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
860; GFX10-W32-NEXT:  .LBB17_2: ; %endif
861; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
862; GFX10-W32-NEXT:    ; return to shader part epilog
863main_body:
864  ; use mbcnt to make sure the branch is divergent
865  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
866  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; %cc is true when the mbcnt result is >= 16; those lanes branch straight to
  ; %endif, the rest execute %if.
867  %cc = icmp uge i32 %hi, 16
868  br i1 %cc, label %endif, label %if
869
870if:
871  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ; Only %out is passed through strict.wqm; its v_add_f32 is expected between
  ; s_wqm and the restore of the saved exec mask.
872  %out = fadd float %src, %src
873  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
  ; %out.1 consumes the strict.wqm result but is not itself wrapped. The
  ; expected code emits its v_add_f32 after exec has been restored, into a
  ; different destination register (v0) than the wrapped add (v2).
874  %out.1 = fadd float %src, %out.0
875  br label %endif
876
877endif:
  ; Lanes that skipped %if return 0.0.
878  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
879  ret float %out.2
880}
881
882; Check that Strict WQM writes aren't coalesced with non-strict writes, since
883; the Strict WQM write could clobber disabled channels in the non-strict one.
884; We enforce this by checking that v_mov gets emitted in the same block as
885; Strict WQM computations.
886define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
887; GFX9-W64-LABEL: test_strict_wqm4:
888; GFX9-W64:       ; %bb.0: ; %main_body
889; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
890; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
891; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
892; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
893; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
894; GFX9-W64-NEXT:    s_cbranch_execz .LBB18_2
895; GFX9-W64-NEXT:  ; %bb.1: ; %if
896; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
897; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
898; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
899; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
900; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
901; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
902; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
903; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
904; GFX9-W64-NEXT:  .LBB18_2: ; %endif
905; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
906; GFX9-W64-NEXT:    ; return to shader part epilog
907;
908; GFX10-W32-LABEL: test_strict_wqm4:
909; GFX10-W32:       ; %bb.0: ; %main_body
910; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
911; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
912; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
913; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
914; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
915; GFX10-W32-NEXT:    s_cbranch_execz .LBB18_2
916; GFX10-W32-NEXT:  ; %bb.1: ; %if
917; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
918; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
919; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
920; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
921; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
922; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
923; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
924; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
925; GFX10-W32-NEXT:  .LBB18_2: ; %endif
926; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
927; GFX10-W32-NEXT:    ; return to shader part epilog
928main_body:
929  ; use mbcnt to make sure the branch is divergent
930  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
931  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; %cc is true when the mbcnt result is >= 16; those lanes branch straight to
  ; %endif, the rest execute %if.
932  %cc = icmp uge i32 %hi, 16
933  br i1 %cc, label %endif, label %if
934
935if:
936  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
937  %out = fadd float %src, %src
  ; The strict.wqm result feeds the phi in %endif directly. The expected code
  ; keeps the v_add_f32 result in v1 and copies it into v0 with a separate
  ; v_mov_b32 emitted after exec has been restored.
938  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
939  br label %endif
940
941endif:
  ; Lanes that skipped %if return 0.0 (v0 is set to 0 before the branch).
942  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
943  ret float %out.1
944}
945
946; Make sure the transition from Exact to Strict WQM then WQM works properly.
947define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
948; GFX9-W64-LABEL: test_strict_wqm5:
949; GFX9-W64:       ; %bb.0: ; %main_body
950; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
951; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
952; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
953; GFX9-W64-NEXT:    s_mov_b64 s[4:5], exec
954; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
955; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
956; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
957; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
958; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
959; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
960; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
961; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
962; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
963; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
964; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
965; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
966; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
967; GFX9-W64-NEXT:    ; return to shader part epilog
968;
969; GFX10-W32-LABEL: test_strict_wqm5:
970; GFX10-W32:       ; %bb.0: ; %main_body
971; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
972; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
973; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
974; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
975; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
976; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
977; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
978; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
979; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
980; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
981; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
982; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
983; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
984; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
985; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
986; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
987; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
988; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
989; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
990; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
991; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
992; GFX10-W32-NEXT:    ; return to shader part epilog
993main_body:
  ; First part: the value loaded at %idx0 is stored back unchanged at %idx0.
  ; Neither value is passed to a wqm/strict.wqm intrinsic.
994  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
995  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Second part: the load at %idx1 and the fadd feed strict.wqm. The expected
  ; code runs them between s_wqm and a restore of the exec mask saved just
  ; before that s_wqm.
996  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
997  %temp = fadd float %src1, %src1
998  %temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
  ; Third part: %out is computed from the strict.wqm result and passed to wqm.
  ; The expected code applies s_wqm again right after the restore, and finally
  ; ands exec with the mask saved before the first buffer load.
999  %out = fadd float %temp.0, %temp.0
1000  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
1001  ret float %out.0
1002}
1003
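1004; Check that Strict WQM is turned on correctly across basic block boundaries.
1005; if..then..endif version
; NOTE: neither RUN line at the top of this file enables the SI-CHECK or
; VI-CHECK prefixes, so FileCheck does not match the four lines below.
1006;SI-CHECK: buffer_load_dword
1007;VI-CHECK: flat_load_dword
1008;SI-CHECK: buffer_load_dword
1009;VI-CHECK: flat_load_dword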
1010define amdgpu_ps float @test_strict_wqm6_then() {
1011; GFX9-W64-LABEL: test_strict_wqm6_then:
1012; GFX9-W64:       ; %bb.0: ; %main_body
1013; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1014; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1015; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
1016; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1017; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
1018; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1019; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1020; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
1021; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
1022; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1023; GFX9-W64-NEXT:    s_cbranch_execz .LBB20_2
1024; GFX9-W64-NEXT:  ; %bb.1: ; %if
1025; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1026; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1027; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
1028; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1029; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
1030; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1031; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
1032; GFX9-W64-NEXT:  .LBB20_2: ; %endif
1033; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1034; GFX9-W64-NEXT:    ; return to shader part epilog
1035;
1036; GFX10-W32-LABEL: test_strict_wqm6_then:
1037; GFX10-W32:       ; %bb.0: ; %main_body
1038; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1039; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1040; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
1041; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1042; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1043; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1044; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
1045; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
1046; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
1047; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1048; GFX10-W32-NEXT:    s_cbranch_execz .LBB20_2
1049; GFX10-W32-NEXT:  ; %bb.1: ; %if
1050; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1051; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1052; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
1053; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1054; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
1055; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1056; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
1057; GFX10-W32-NEXT:  .LBB20_2: ; %endif
1058; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1059; GFX10-W32-NEXT:    ; return to shader part epilog
1060main_body:
  ; %src0 is loaded here but only used in %if, by the fadd that feeds
  ; strict.wqm. The expected code wraps this load in its own s_wqm region and
  ; restores exec before the mbcnt instructions.
1061  %src0 = load volatile float, float addrspace(1)* undef
1062  ; use mbcnt to make sure the branch is divergent
1063  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1064  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; %cc is true when the mbcnt result is >= 16; those lanes branch straight to
  ; %endif, the rest execute %if.
1065  %cc = icmp uge i32 %hi, 16
1066  br i1 %cc, label %endif, label %if
1067
1068if:
  ; The second load and the fadd are expected inside a second s_wqm region
  ; that is opened and closed within this block.
1069  %src1 = load volatile float, float addrspace(1)* undef
1070  %out = fadd float %src0, %src1
1071  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1072  br label %endif
1073
1074endif:
  ; Lanes that skipped %if return 0.0.
1075  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
1076  ret float %out.1
1077}
1078
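1079; Check that Strict WQM is turned on correctly across basic block boundaries.
1080; loop version
; NOTE: neither RUN line at the top of this file enables the SI-CHECK or
; VI-CHECK prefixes, so FileCheck does not match the four lines below.
1081;SI-CHECK: buffer_load_dword
1082;VI-CHECK: flat_load_dword
1083;SI-CHECK: buffer_load_dword
1084;VI-CHECK: flat_load_dword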
1085define amdgpu_ps float @test_strict_wqm6_loop() {
1086; GFX9-W64-LABEL: test_strict_wqm6_loop:
1087; GFX9-W64:       ; %bb.0: ; %main_body
1088; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1089; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1090; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
1091; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1092; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
1093; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1094; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
1095; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
1096; GFX9-W64-NEXT:  .LBB21_1: ; %loop
1097; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1098; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1099; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1100; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
1101; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1102; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1103; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
1104; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1105; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1106; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1107; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
1108; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
1109; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
1110; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1111; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
1112; GFX9-W64-NEXT:    s_cbranch_execnz .LBB21_1
1113; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
1114; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1115; GFX9-W64-NEXT:    ; return to shader part epilog
1116;
1117; GFX10-W32-LABEL: test_strict_wqm6_loop:
1118; GFX10-W32:       ; %bb.0: ; %main_body
1119; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1120; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1121; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
1122; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1123; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
1124; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
1125; GFX10-W32-NEXT:    s_mov_b32 s0, 0
1126; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
1127; GFX10-W32-NEXT:  .LBB21_1: ; %loop
1128; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1129; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1130; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1131; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
1132; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1133; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1134; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
1135; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
1136; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1137; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
1138; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
1139; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
1140; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1141; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
1142; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
1143; GFX10-W32-NEXT:    s_cbranch_execnz .LBB21_1
1144; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
1145; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1146; GFX10-W32-NEXT:    ; return to shader part epilog
1147main_body:
  ; %src0 is loaded once before the loop and reused by the fadd in every
  ; iteration. The expected code wraps this load in its own s_wqm region.
1148  %src0 = load volatile float, float addrspace(1)* undef
1149  ; use mbcnt to make sure the branch is divergent
1150  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
1151  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
1152  br label %loop
1153
1154loop:
  ; %counter starts at the mbcnt result and is decremented once per iteration.
1155  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
  ; The expected code opens two separate s_wqm regions per iteration: one
  ; around the load and one around the v_add_f32, with the counter decrement
  ; emitted between them after exec has been restored.
1156  %src1 = load volatile float, float addrspace(1)* undef
1157  %out = fadd float %src0, %src1
1158  %out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
1159  %counter.1 = sub i32 %counter, 1
  ; Keep looping while the decremented counter is non-zero.
1160  %cc = icmp ne i32 %counter.1, 0
1161  br i1 %cc, label %loop, label %endloop
1162
1163endloop:
  ; The returned value is the strict.wqm result from the last iteration.
1164  ret float %out.0
1165}
1166
1167; Check that enabling WQM anywhere enables WQM for the set.inactive source.
1168define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
1169; GFX9-W64-LABEL: test_set_inactive2:
1170; GFX9-W64:       ; %bb.0: ; %main_body
1171; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
1172; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1173; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
1174; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s0
1175; GFX9-W64-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 idxen
1176; GFX9-W64-NEXT:    s_nop 0
1177; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
1178; GFX9-W64-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
1179; GFX9-W64-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $exec killed $exec
1180; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
1181; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1182; GFX9-W64-NEXT:    v_add_u32_e32 v1, v2, v1
1183; GFX9-W64-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1184; GFX9-W64-NEXT:    s_endpgm
1185;
1186; GFX10-W32-LABEL: test_set_inactive2:
1187; GFX10-W32:       ; %bb.0: ; %main_body
1188; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
1189; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1190; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s1
1191; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
1192; GFX10-W32-NEXT:    s_clause 0x1
1193; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
1194; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
1195; GFX10-W32-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
1196; GFX10-W32-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $exec killed $exec
1197; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
1198; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1199; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
1200; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1201; GFX10-W32-NEXT:    s_endpgm
1202main_body:
  ; The value loaded at %idx1 is passed through set.inactive with an undef
  ; second operand. No exec-inverting write shows up for it in the expected
  ; code.
1203  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
1204  %src1.0 = bitcast float %src1 to i32
1205  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  ; The value loaded at %idx0 is passed through wqm. This is the only wqm
  ; intrinsic in the function, yet the expected code issues both buffer loads
  ; after s_wqm and before exec is and-ed with the saved mask.
1206  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
1207  %src0.0 = bitcast float %src0 to i32
1208  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  ; The add and the store at %idx1 are expected after that s_and of exec.
1209  %out = add i32 %src0.1, %src1.1
1210  %out.0 = bitcast i32 %out to float
1211  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
1212  ret void
1213}
1214
1215; Check a case of one branch of an if-else requiring WQM, the other requiring
1216; exact.
1217; Note: In this particular case, the save-and-restore could be avoided if the
1218; analysis understood that the two branches of the if-else are mutually
1219; exclusive.
1220define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1221; GFX9-W64-LABEL: test_control_flow_0:
1222; GFX9-W64:       ; %bb.0: ; %main_body
1223; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1224; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1225; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1226; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1227; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1228; GFX9-W64-NEXT:    s_cbranch_execz .LBB23_2
1229; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
1230; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
1231; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1232; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1233; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
1234; GFX9-W64-NEXT:  .LBB23_2: ; %Flow
1235; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[14:15], s[14:15]
1236; GFX9-W64-NEXT:    s_cbranch_execz .LBB23_4
1237; GFX9-W64-NEXT:  ; %bb.3: ; %IF
1238; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1239; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1240; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1241; GFX9-W64-NEXT:  .LBB23_4: ; %END
1242; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1243; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1244; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1245; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1246; GFX9-W64-NEXT:    ; return to shader part epilog
1247;
1248; GFX10-W32-LABEL: test_control_flow_0:
1249; GFX10-W32:       ; %bb.0: ; %main_body
1250; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1251; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1252; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
1253; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
1254; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1255; GFX10-W32-NEXT:    s_cbranch_execz .LBB23_2
1256; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
1257; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
1258; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1259; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1260; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
1261; GFX10-W32-NEXT:  .LBB23_2: ; %Flow
1262; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s13, s13
1263; GFX10-W32-NEXT:    s_cbranch_execz .LBB23_4
1264; GFX10-W32-NEXT:  ; %bb.3: ; %IF
1265; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1266; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1267; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1268; GFX10-W32-NEXT:  .LBB23_4: ; %END
1269; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1270; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1271; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1272; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1273; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1274; GFX10-W32-NEXT:    ; return to shader part epilog
1275main_body:
  ; %z == 0 selects %IF (image samples); any other value selects %ELSE (buffer
  ; store). The expected code saves exec and applies s_wqm before the compare.
1276  %cmp = icmp eq i32 %z, 0
1277  br i1 %cmp, label %IF, label %ELSE
1278
1279IF:
  ; Two dependent samples: element 0 of the first result is the coordinate of
  ; the second. Only element 0 of each result is used (dmask:0x1 in the
  ; expected code).
1280  %c.bc = bitcast i32 %c to float
1281  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1282  %tex0 = extractelement <4 x float> %tex, i32 0
1283  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1284  %data.if = extractelement <4 x float> %dtex, i32 0
1285  br label %END
1286
1287ELSE:
  ; The expected code wraps this store in s_and_saveexec against the exec mask
  ; saved at entry and restores exec right after it; this is the
  ; save-and-restore the comment above the function refers to.
1288  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
1289  br label %END
1290
1291END:
  ; Returns the second sample's element 0 from %IF, or %data unchanged from
  ; %ELSE.
1292  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1293  ret float %r
1294}
1295
1296; Reverse branch order compared to the previous test.
1297define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
1298; GFX9-W64-LABEL: test_control_flow_1:
1299; GFX9-W64:       ; %bb.0: ; %main_body
1300; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1301; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1302; GFX9-W64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1303; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1304; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1305; GFX9-W64-NEXT:    s_cbranch_execz .LBB24_2
1306; GFX9-W64-NEXT:  ; %bb.1: ; %IF
1307; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1308; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1309; GFX9-W64-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
1310; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1311; GFX9-W64-NEXT:  .LBB24_2: ; %Flow
1312; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], s[14:15]
1313; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1314; GFX9-W64-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
1315; GFX9-W64-NEXT:    s_xor_b64 exec, exec, s[0:1]
1316; GFX9-W64-NEXT:    s_cbranch_execz .LBB24_4
1317; GFX9-W64-NEXT:  ; %bb.3: ; %ELSE
1318; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1319; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1320; GFX9-W64-NEXT:  .LBB24_4: ; %END
1321; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1322; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1323; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
1324; GFX9-W64-NEXT:    ; return to shader part epilog
1325;
1326; GFX10-W32-LABEL: test_control_flow_1:
1327; GFX10-W32:       ; %bb.0: ; %main_body
1328; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1329; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1330; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
1331; GFX10-W32-NEXT:    v_cmpx_ne_u32_e32 0, v1
1332; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1333; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_2
1334; GFX10-W32-NEXT:  ; %bb.1: ; %IF
1335; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1336; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1337; GFX10-W32-NEXT:    image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1338; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1339; GFX10-W32-NEXT:  .LBB24_2: ; %Flow
1340; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, s13
1341; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1342; GFX10-W32-NEXT:    s_and_b32 s0, exec_lo, s0
1343; GFX10-W32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
1344; GFX10-W32-NEXT:    s_cbranch_execz .LBB24_4
1345; GFX10-W32-NEXT:  ; %bb.3: ; %ELSE
1346; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1347; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1348; GFX10-W32-NEXT:  .LBB24_4: ; %END
1349; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1350; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1351; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
1352; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1353; GFX10-W32-NEXT:    ; return to shader part epilog
1354main_body:
  ; Branch targets are swapped relative to test_control_flow_0: %z == 0
  ; selects %ELSE (buffer store), any other value selects %IF (image samples).
1355  %cmp = icmp eq i32 %z, 0
1356  br i1 %cmp, label %ELSE, label %IF
1357
1358IF:
  ; Two dependent samples: element 0 of the first result is the coordinate of
  ; the second. The expected code emits this block first, under the exec mask
  ; produced by s_wqm at entry.
1359  %c.bc = bitcast i32 %c to float
1360  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1361  %tex0 = extractelement <4 x float> %tex, i32 0
1362  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1363  %data.if = extractelement <4 x float> %dtex, i32 0
1364  br label %END
1365
1366ELSE:
  ; In the expected code exec is and-ed with the mask saved at entry in the
  ; %Flow block, before this store; no separate save/restore pair surrounds
  ; the store itself.
1367  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 %c, i32 0, i32 0, i32 0)
1368  br label %END
1369
1370END:
  ; Returns the second sample's element 0 from %IF, or %data unchanged from
  ; %ELSE.
1371  %r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
1372  ret float %r
1373}
1374
1375; Check that branch conditions are properly marked as needing WQM...
1376define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
1377; GFX9-W64-LABEL: test_control_flow_2:
1378; GFX9-W64:       ; %bb.0: ; %main_body
1379; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1380; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1381; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1382; GFX9-W64-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
1383; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1384; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
1385; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1386; GFX9-W64-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
1387; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1388; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
1389; GFX9-W64-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
1390; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1391; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1392; GFX9-W64-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
1393; GFX9-W64-NEXT:  ; %bb.1: ; %ELSE
1394; GFX9-W64-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
1395; GFX9-W64-NEXT:    ; implicit-def: $vgpr5
1396; GFX9-W64-NEXT:  ; %bb.2: ; %Flow
1397; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[14:15], s[14:15]
1398; GFX9-W64-NEXT:  ; %bb.3: ; %IF
1399; GFX9-W64-NEXT:    v_mul_lo_u32 v0, v5, 3
1400; GFX9-W64-NEXT:  ; %bb.4: ; %END
1401; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1402; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1403; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1404; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1405; GFX9-W64-NEXT:    ; return to shader part epilog
1406;
1407; GFX10-W32-LABEL: test_control_flow_2:
1408; GFX10-W32:       ; %bb.0: ; %main_body
1409; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1410; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1411; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1412; GFX10-W32-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 idxen
1413; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1414; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[0:3], 0 idxen
1415; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1416; GFX10-W32-NEXT:    v_cmp_nlt_f32_e32 vcc_lo, 0, v0
1417; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1418; GFX10-W32-NEXT:    buffer_store_dword v4, v2, s[0:3], 0 idxen
1419; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1420; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1421; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
1422; GFX10-W32-NEXT:    s_xor_b32 s13, exec_lo, s13
1423; GFX10-W32-NEXT:  ; %bb.1: ; %ELSE
1424; GFX10-W32-NEXT:    v_lshlrev_b32_e32 v0, 2, v5
1425; GFX10-W32-NEXT:    ; implicit-def: $vgpr5
1426; GFX10-W32-NEXT:  ; %bb.2: ; %Flow
1427; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s13, s13
1428; GFX10-W32-NEXT:  ; %bb.3: ; %IF
1429; GFX10-W32-NEXT:    v_mul_lo_u32 v0, v5, 3
1430; GFX10-W32-NEXT:  ; %bb.4: ; %END
1431; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1432; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1433; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1434; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1435; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1436; GFX10-W32-NEXT:    ; return to shader part epilog
1437main_body:
  ; Shape of the test: buffer store, buffer load, buffer store, then a branch
  ; on the loaded value whose two arms compute the coordinate sampled in %END.
  ; In the expected output each store follows an s_and with the saved exec
  ; mask and the load follows an s_wqm.
1438  %idx.1 = extractelement <3 x i32> %idx, i32 0
1439  %data.1 = extractelement <2 x float> %data, i32 0
1440  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
1441
1442  ; The load that determines the branch (and should therefore be WQM) is
1443  ; surrounded by stores that require disabled WQM.
1444  %idx.2 = extractelement <3 x i32> %idx, i32 1
1445  %z = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx.2, i32 0, i32 0, i32 0)
1446
1447  %idx.3 = extractelement <3 x i32> %idx, i32 2
1448  %data.3 = extractelement <2 x float> %data, i32 1
1449  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.3, <4 x i32> undef, i32 %idx.3, i32 0, i32 0, i32 0)
1450
  ; The branch condition is derived only from the loaded value %z.
1451  %cc = fcmp ogt float %z, 0.0
1452  br i1 %cc, label %IF, label %ELSE
1453
1454IF:
  ; Both arms only scale %coord (by 3 here, by 4 in %ELSE).
1455  %coord.IF = mul i32 %coord, 3
1456  br label %END
1457
1458ELSE:
1459  %coord.ELSE = mul i32 %coord, 4
1460  br label %END
1461
1462END:
  ; The merged coordinate is the input of the image sample, so the sampled
  ; address depends on which arm was taken.
1463  %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
1464  %coord.END.bc = bitcast i32 %coord.END to float
1465  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1466  ret <4 x float> %tex
1467}
1468
1469; ... but only if they really do need it.
1470define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
1471; GFX9-W64-LABEL: test_control_flow_3:
1472; GFX9-W64:       ; %bb.0: ; %main_body
1473; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1474; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1475; GFX9-W64-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1476; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1477; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1478; GFX9-W64-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
1479; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1480; GFX9-W64-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v1
1481; GFX9-W64-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1482; GFX9-W64-NEXT:    ; implicit-def: $vgpr0
1483; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1484; GFX9-W64-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1485; GFX9-W64-NEXT:    s_cbranch_execnz .LBB26_3
1486; GFX9-W64-NEXT:  ; %bb.1: ; %Flow
1487; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
1488; GFX9-W64-NEXT:    s_cbranch_execnz .LBB26_4
1489; GFX9-W64-NEXT:  .LBB26_2: ; %END
1490; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1491; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1492; GFX9-W64-NEXT:    s_branch .LBB26_5
1493; GFX9-W64-NEXT:  .LBB26_3: ; %ELSE
1494; GFX9-W64-NEXT:    v_mul_f32_e32 v0, 4.0, v1
1495; GFX9-W64-NEXT:    ; implicit-def: $vgpr1
1496; GFX9-W64-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
1497; GFX9-W64-NEXT:    s_cbranch_execz .LBB26_2
1498; GFX9-W64-NEXT:  .LBB26_4: ; %IF
1499; GFX9-W64-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
1500; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
1501; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1502; GFX9-W64-NEXT:    s_branch .LBB26_5
1503; GFX9-W64-NEXT:  .LBB26_5:
1504;
1505; GFX10-W32-LABEL: test_control_flow_3:
1506; GFX10-W32:       ; %bb.0: ; %main_body
1507; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1508; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1509; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1510; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1511; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1512; GFX10-W32-NEXT:    image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1513; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1514; GFX10-W32-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 idxen
1515; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1516; GFX10-W32-NEXT:    ; implicit-def: $vgpr0
1517; GFX10-W32-NEXT:    v_cmpx_nlt_f32_e32 0, v1
1518; GFX10-W32-NEXT:    s_xor_b32 s0, exec_lo, s0
1519; GFX10-W32-NEXT:    s_cbranch_execnz .LBB26_3
1520; GFX10-W32-NEXT:  ; %bb.1: ; %Flow
1521; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s0, s0
1522; GFX10-W32-NEXT:    s_cbranch_execnz .LBB26_4
1523; GFX10-W32-NEXT:  .LBB26_2: ; %END
1524; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1525; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1526; GFX10-W32-NEXT:    s_branch .LBB26_5
1527; GFX10-W32-NEXT:  .LBB26_3: ; %ELSE
1528; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 4.0, v1
1529; GFX10-W32-NEXT:    ; implicit-def: $vgpr1
1530; GFX10-W32-NEXT:    s_andn2_saveexec_b32 s0, s0
1531; GFX10-W32-NEXT:    s_cbranch_execz .LBB26_2
1532; GFX10-W32-NEXT:  .LBB26_4: ; %IF
1533; GFX10-W32-NEXT:    v_mul_f32_e32 v0, 0x40400000, v1
1534; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1535; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1536; GFX10-W32-NEXT:    s_branch .LBB26_5
1537; GFX10-W32-NEXT:  .LBB26_5:
1538main_body:
  ; Two chained samples: the first result is the coordinate of the second.
  ; The second result (%dtex.1) is stored, compared against 0.0 for the branch,
  ; and scaled in both arms; the merged value is only returned, never sampled.
  ; In the expected output no s_wqm follows the s_and that restores the saved
  ; exec mask after the first image_sample.
1539  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1540  %tex0 = extractelement <4 x float> %tex, i32 0
1541  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1542  %dtex.1 = extractelement <4 x float> %dtex, i32 0
1543  call void @llvm.amdgcn.struct.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
1544
  ; The branch condition comes from the result of the second sample.
1545  %cc = fcmp ogt float %dtex.1, 0.0
1546  br i1 %cc, label %IF, label %ELSE
1547
1548IF:
  ; The 3.0 multiplier appears as the literal 0x40400000 in the expected output.
1549  %tex.IF = fmul float %dtex.1, 3.0
1550  br label %END
1551
1552ELSE:
1553  %tex.ELSE = fmul float %dtex.1, 4.0
1554  br label %END
1555
1556END:
  ; The merged value is returned directly; no image operation consumes it.
1557  %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
1558  ret float %tex.END
1559}
1560
1561; Another test that failed at some point because of terminator handling.
1562define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
1563; GFX9-W64-LABEL: test_control_flow_4:
1564; GFX9-W64:       ; %bb.0: ; %main_body
1565; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1566; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1567; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1568; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
1569; GFX9-W64-NEXT:    s_cbranch_execz .LBB27_2
1570; GFX9-W64-NEXT:  ; %bb.1: ; %IF
1571; GFX9-W64-NEXT:    s_and_saveexec_b64 s[16:17], s[12:13]
1572; GFX9-W64-NEXT:    buffer_load_dword v1, off, s[0:3], 0
1573; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 1
1574; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1575; GFX9-W64-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 idxen
1576; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
1577; GFX9-W64-NEXT:  .LBB27_2: ; %END
1578; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
1579; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
1580; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1581; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1582; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1583; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1584; GFX9-W64-NEXT:    ; return to shader part epilog
1585;
1586; GFX10-W32-LABEL: test_control_flow_4:
1587; GFX10-W32:       ; %bb.0: ; %main_body
1588; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1589; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1590; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
1591; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
1592; GFX10-W32-NEXT:    s_cbranch_execz .LBB27_2
1593; GFX10-W32-NEXT:  ; %bb.1: ; %IF
1594; GFX10-W32-NEXT:    s_and_saveexec_b32 s14, s12
1595; GFX10-W32-NEXT:    buffer_load_dword v1, off, s[0:3], 0
1596; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 1
1597; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1598; GFX10-W32-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 idxen
1599; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
1600; GFX10-W32-NEXT:  .LBB27_2: ; %END
1601; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
1602; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1603; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1604; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1605; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1606; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1607; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1608; GFX10-W32-NEXT:    ; return to shader part epilog
1609main_body:
  ; NOTE(review): %z is not referenced anywhere in this body.
  ; %IF is entered only when %y == 0; otherwise control goes straight to %END.
1610  %cond = icmp eq i32 %y, 0
1611  br i1 %cond, label %IF, label %END
1612
1613IF:
  ; Only a raw buffer load feeding a struct buffer store. The expected output
  ; brackets this block with s_and_saveexec against the exec mask saved at
  ; function entry and restores exec with s_mov before falling into %END.
1614  %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> undef, i32 0, i32 0, i32 0)
1615  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
1616  br label %END
1617
1618END:
  ; Two chained samples: the first result is the coordinate of the second.
1619  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1620  %tex0 = extractelement <4 x float> %tex, i32 0
1621  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1622  ret <4 x float> %dtex
1623}
1624
1625; Kill is performed in WQM mode so that uniform kill behaves correctly ...
1626define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
1627; GFX9-W64-LABEL: test_kill_0:
1628; GFX9-W64:       ; %bb.0: ; %main_body
1629; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1630; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1631; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1632; GFX9-W64-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
1633; GFX9-W64-NEXT:    s_nop 0
1634; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1635; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1636; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v6
1637; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
1638; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB28_2
1639; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
1640; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
1641; GFX9-W64-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
1642; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1643; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1644; GFX9-W64-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
1645; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1646; GFX9-W64-NEXT:    v_add_f32_e32 v0, v7, v11
1647; GFX9-W64-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
1648; GFX9-W64-NEXT:    v_add_f32_e32 v1, v8, v12
1649; GFX9-W64-NEXT:    v_add_f32_e32 v2, v9, v13
1650; GFX9-W64-NEXT:    v_add_f32_e32 v3, v10, v14
1651; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1652; GFX9-W64-NEXT:    s_branch .LBB28_3
1653; GFX9-W64-NEXT:  .LBB28_2:
1654; GFX9-W64-NEXT:    s_mov_b64 exec, 0
1655; GFX9-W64-NEXT:    exp null off, off, off, off done vm
1656; GFX9-W64-NEXT:    s_endpgm
1657; GFX9-W64-NEXT:  .LBB28_3:
1658;
1659; GFX10-W32-LABEL: test_kill_0:
1660; GFX10-W32:       ; %bb.0: ; %main_body
1661; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1662; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1663; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1664; GFX10-W32-NEXT:    image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1665; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
1666; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1667; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v6
1668; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
1669; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB28_2
1670; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
1671; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
1672; GFX10-W32-NEXT:    image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1673; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1674; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1675; GFX10-W32-NEXT:    image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1676; GFX10-W32-NEXT:    buffer_store_dword v3, v1, s[0:3], 0 idxen
1677; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1678; GFX10-W32-NEXT:    v_add_f32_e32 v4, v8, v12
1679; GFX10-W32-NEXT:    v_add_f32_e32 v5, v10, v14
1680; GFX10-W32-NEXT:    v_add_f32_e32 v0, v7, v11
1681; GFX10-W32-NEXT:    v_add_f32_e32 v2, v9, v13
1682; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v4
1683; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v5
1684; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1685; GFX10-W32-NEXT:    s_branch .LBB28_3
1686; GFX10-W32-NEXT:  .LBB28_2:
1687; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
1688; GFX10-W32-NEXT:    exp null off, off, off, off done vm
1689; GFX10-W32-NEXT:    s_endpgm
1690; GFX10-W32-NEXT:  .LBB28_3:
1691main_body:
  ; NOTE(review): %ptr is not referenced anywhere in this body.
  ; Sequence under test: sample, buffer store, kill, buffer store, then two
  ; more samples (the last one takes its coordinate from the one before it).
1692  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1693  %idx.0 = extractelement <2 x i32> %idx, i32 0
1694  %data.0 = extractelement <2 x float> %data, i32 0
1695  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i32 0, i32 0)
1696
  ; The kill condition is computed from %z alone. In the expected output the
  ; compare is preceded by an s_wqm, and s_andn2 with the compare result is
  ; applied to both the saved exec mask and exec itself.
1697  %z.cmp = fcmp olt float %z, 0.0
1698  call void @llvm.amdgcn.kill(i1 %z.cmp)
1699
1700  %idx.1 = extractelement <2 x i32> %idx, i32 1
1701  %data.1 = extractelement <2 x float> %data, i32 1
1702  call void @llvm.amdgcn.struct.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i32 0, i32 0)
1703  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1704  %tex2.0 = extractelement <4 x float> %tex2, i32 0
1705  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
  ; The result combines the sample taken before the kill with the one after it.
1706  %out = fadd <4 x float> %tex, %dtex
1707
1708  ret <4 x float> %out
1709}
1710
1711; ... but only if WQM is necessary.
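; NOTE(review): the CHECK-prefixed lines below look stale -- no RUN line visible at the top of this file enables the CHECK prefix, so FileCheck does not evaluate them; the autogenerated assertions inside the function are what is actually tested.
1712; CHECK-LABEL: {{^}}test_kill_1: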
1713; CHECK-NEXT: ; %main_body
1714; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
1715; CHECK: s_wqm_b64 exec, exec
1716; CHECK: image_sample
1717; CHECK: s_and_b64 exec, exec, [[ORIG]]
1718; CHECK: image_sample
1719; CHECK-NOT: wqm
1720; CHECK-DAG: buffer_store_dword
1721; CHECK-DAG: v_cmp_
1722define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
1723; GFX9-W64-LABEL: test_kill_1:
1724; GFX9-W64:       ; %bb.0: ; %main_body
1725; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
1726; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v2
1727; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1728; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v0
1729; GFX9-W64-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
1730; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
1731; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1732; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
1733; GFX9-W64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v4
1734; GFX9-W64-NEXT:    s_andn2_b64 s[12:13], s[12:13], vcc
1735; GFX9-W64-NEXT:    buffer_store_dword v5, off, s[0:3], 0
1736; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB29_2
1737; GFX9-W64-NEXT:  ; %bb.1: ; %main_body
1738; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, vcc
1739; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1740; GFX9-W64-NEXT:    s_branch .LBB29_3
1741; GFX9-W64-NEXT:  .LBB29_2:
1742; GFX9-W64-NEXT:    s_mov_b64 exec, 0
1743; GFX9-W64-NEXT:    exp null off, off, off, off done vm
1744; GFX9-W64-NEXT:    s_endpgm
1745; GFX9-W64-NEXT:  .LBB29_3:
1746;
1747; GFX10-W32-LABEL: test_kill_1:
1748; GFX10-W32:       ; %bb.0: ; %main_body
1749; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
1750; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v2
1751; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1752; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v0
1753; GFX10-W32-NEXT:    image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1754; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
1755; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1756; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
1757; GFX10-W32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v4
1758; GFX10-W32-NEXT:    buffer_store_dword v5, off, s[0:3], 0
1759; GFX10-W32-NEXT:    s_andn2_b32 s12, s12, vcc_lo
1760; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB29_2
1761; GFX10-W32-NEXT:  ; %bb.1: ; %main_body
1762; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
1763; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1764; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1765; GFX10-W32-NEXT:    s_branch .LBB29_3
1766; GFX10-W32-NEXT:  .LBB29_2:
1767; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
1768; GFX10-W32-NEXT:    exp null off, off, off, off done vm
1769; GFX10-W32-NEXT:    s_endpgm
1770; GFX10-W32-NEXT:  .LBB29_3:
1771main_body:
  ; Two chained samples come first, then a raw buffer store, then the kill;
  ; the second sample's result is returned unchanged. Nothing after the kill
  ; feeds an image sample, and in the expected output the only s_wqm is the
  ; one before the first image_sample.
  ; NOTE(review): %idx and %coord2 are not referenced anywhere in this body.
1772  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1773  %tex0 = extractelement <4 x float> %tex, i32 0
1774  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
1775
1776  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)
1777
  ; The kill condition is computed from %z alone.
1778  %z.cmp = fcmp olt float %z, 0.0
1779  call void @llvm.amdgcn.kill(i1 %z.cmp)
1780
1781  ret <4 x float> %dtex
1782}
1783
1784; Check prolog shaders.
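; NOTE(review): the CHECK-prefixed lines below look stale -- no RUN line visible at the top of this file enables the CHECK prefix, so FileCheck does not evaluate them; the autogenerated assertions inside the function are what is actually tested.
1785; CHECK-LABEL: {{^}}test_prolog_1: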
1786; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
1787; CHECK: s_wqm_b64 exec, exec
1788; CHECK: v_add_f32_e32 v0,
1789; CHECK: s_and_b64 exec, exec, [[ORIG]]
1790define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
1791; GFX9-W64-LABEL: test_prolog_1:
1792; GFX9-W64:       ; %bb.0: ; %main_body
1793; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1794; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1795; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v1
1796; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1797; GFX9-W64-NEXT:    ; return to shader part epilog
1798;
1799; GFX10-W32-LABEL: test_prolog_1:
1800; GFX10-W32:       ; %bb.0: ; %main_body
1801; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1802; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1803; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v1
1804; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1805; GFX10-W32-NEXT:    ; return to shader part epilog
1806main_body:
  ; The body is a single fadd with no image or buffer operations.
  ; NOTE(review): attribute group #5 is defined outside this chunk; presumably
  ; it is what places the add between s_wqm and the exec restore in the
  ; expected output -- confirm against the attribute list.
1807  %s = fadd float %a, %b
1808  ret float %s
1809}
1810
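; NOTE(review): the CHECK-prefixed lines below look stale -- no RUN line visible at the top of this file enables the CHECK prefix, so FileCheck does not evaluate them; the autogenerated assertions inside the function are what is actually tested.
1811; CHECK-LABEL: {{^}}test_loop_vcc: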
1812; CHECK-NEXT: ; %entry
1813; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
1814; CHECK: s_wqm_b64 exec, exec
1815; CHECK: v_mov
1816; CHECK: v_mov
1817; CHECK: v_mov
1818; CHECK: v_mov
1819; CHECK: s_and_b64 exec, exec, [[LIVE]]
1820; CHECK: image_store
1821; CHECK: s_wqm_b64 exec, exec
1822; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
1823; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
1824
1825; CHECK: [[LOOPHDR:.LBB[0-9]+_[0-9]+]]: ; %body
1826; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
1827; CHECK: [[LOOP:.LBB[0-9]+_[0-9]+]]: ; %loop
1828; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
1829; CHECK: s_cbranch_vccz [[LOOPHDR]]
1830
1831; CHECK: ; %break
1832; CHECK: ; return
1833define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
1834; GFX9-W64-LABEL: test_loop_vcc:
1835; GFX9-W64:       ; %bb.0: ; %entry
1836; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1837; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1838; GFX9-W64-NEXT:    v_mov_b32_e32 v7, v3
1839; GFX9-W64-NEXT:    v_mov_b32_e32 v6, v2
1840; GFX9-W64-NEXT:    v_mov_b32_e32 v5, v1
1841; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
1842; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1843; GFX9-W64-NEXT:    image_store v[4:7], v0, s[0:7] dmask:0xf unorm
1844; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1845; GFX9-W64-NEXT:    v_mov_b32_e32 v8, 0
1846; GFX9-W64-NEXT:    s_mov_b32 s4, 0x40e00000
1847; GFX9-W64-NEXT:    s_branch .LBB31_2
1848; GFX9-W64-NEXT:  .LBB31_1: ; %body
1849; GFX9-W64-NEXT:    ; in Loop: Header=BB31_2 Depth=1
1850; GFX9-W64-NEXT:    image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
1851; GFX9-W64-NEXT:    v_add_f32_e32 v8, 2.0, v8
1852; GFX9-W64-NEXT:    s_cbranch_execz .LBB31_4
1853; GFX9-W64-NEXT:  .LBB31_2: ; %loop
1854; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
1855; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1856; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v4
1857; GFX9-W64-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v8
1858; GFX9-W64-NEXT:    v_mov_b32_e32 v1, v5
1859; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v6
1860; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v7
1861; GFX9-W64-NEXT:    s_cbranch_vccz .LBB31_1
1862; GFX9-W64-NEXT:  ; %bb.3:
1863; GFX9-W64-NEXT:    ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
1864; GFX9-W64-NEXT:    ; implicit-def: $vgpr8
1865; GFX9-W64-NEXT:  .LBB31_4: ; %break
1866; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1867; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1868; GFX9-W64-NEXT:    ; return to shader part epilog
1869;
1870; GFX10-W32-LABEL: test_loop_vcc:
1871; GFX10-W32:       ; %bb.0: ; %entry
1872; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1873; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1874; GFX10-W32-NEXT:    v_mov_b32_e32 v8, 0
1875; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1876; GFX10-W32-NEXT:    image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
1877; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1878; GFX10-W32-NEXT:    s_branch .LBB31_2
1879; GFX10-W32-NEXT:    .p2align 6
1880; GFX10-W32-NEXT:  .LBB31_1: ; %body
1881; GFX10-W32-NEXT:    ; in Loop: Header=BB31_2 Depth=1
1882; GFX10-W32-NEXT:    image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1883; GFX10-W32-NEXT:    v_add_f32_e32 v8, 2.0, v8
1884; GFX10-W32-NEXT:    s_cbranch_execz .LBB31_4
1885; GFX10-W32-NEXT:  .LBB31_2: ; %loop
1886; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
1887; GFX10-W32-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
1888; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1889; GFX10-W32-NEXT:    v_mov_b32_e32 v7, v3
1890; GFX10-W32-NEXT:    v_mov_b32_e32 v6, v2
1891; GFX10-W32-NEXT:    v_mov_b32_e32 v5, v1
1892; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
1893; GFX10-W32-NEXT:    s_cbranch_vccz .LBB31_1
1894; GFX10-W32-NEXT:  ; %bb.3:
1895; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1896; GFX10-W32-NEXT:    ; implicit-def: $vgpr8
1897; GFX10-W32-NEXT:  .LBB31_4: ; %break
1898; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1899; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1900; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v4
1901; GFX10-W32-NEXT:    v_mov_b32_e32 v1, v5
1902; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v6
1903; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v7
1904; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1905; GFX10-W32-NEXT:    ; return to shader part epilog
1906entry:
  ; An image store of %in precedes the loop; its coordinate and resource
  ; operands are undef. In the expected output the store follows an s_and with
  ; the saved exec mask and an s_wqm follows the store, before the loop.
1907  call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
1908  br label %loop
1909
1910loop:
  ; %ctr.iv starts at 0.0 and grows by 2.0 per iteration; the loop exits once
  ; it is greater than 7.0 (the 0x40e00000 literal in the expected output).
  ; %c.iv carries the most recent sample result, starting from %in.
1911  %ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
1912  %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
1913  %cc = fcmp ogt float %ctr.iv, 7.0
1914  br i1 %cc, label %break, label %body
1915
1916body:
  ; The next %c.iv is a sample whose coordinate is element 0 of the current one.
1917  %c.iv0 = extractelement <4 x float> %c.iv, i32 0
1918  %c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
1919  %ctr.next = fadd float %ctr.iv, 2.0
1920  br label %loop
1921
1922break:
  ; Returns whatever %c.iv held when the exit condition became true.
1923  ret <4 x float> %c.iv
1924}
1925
1926; Only intrinsic stores need exact execution -- other stores do not have
1927; externally visible effects and may require WQM for correctness.
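; NOTE(review): the CHECK-prefixed lines below look stale -- no RUN line visible at the top of this file enables the CHECK prefix, so FileCheck does not evaluate them; the autogenerated assertions inside the function are what is actually tested.
1928; CHECK-LABEL: {{^}}test_alloca: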
1929; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
1930; CHECK: s_wqm_b64 exec, exec
1931
1932; CHECK: s_and_b64 exec, exec, [[LIVE]]
1933; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
1934; CHECK: s_wqm_b64 exec, exec
1935; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}}
1936; CHECK: s_and_b64 exec, exec, [[LIVE]]
1937; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
1938; CHECK: s_wqm_b64 exec, exec
1939; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
1940
1941; CHECK: s_and_b64 exec, exec, [[LIVE]]
1942; CHECK: image_sample
1943; CHECK: buffer_store_dwordx4
1944define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
1945; GFX9-W64-LABEL: test_alloca:
1946; GFX9-W64:       ; %bb.0: ; %entry
1947; GFX9-W64-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1948; GFX9-W64-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1949; GFX9-W64-NEXT:    s_mov_b32 s10, -1
1950; GFX9-W64-NEXT:    s_mov_b32 s11, 0xe00000
1951; GFX9-W64-NEXT:    s_add_u32 s8, s8, s0
1952; GFX9-W64-NEXT:    s_addc_u32 s9, s9, 0
1953; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
1954; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1955; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1956; GFX9-W64-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1957; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1958; GFX9-W64-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4
1959; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1960; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1961; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
1962; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 idxen
1963; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
1964; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 4
1965; GFX9-W64-NEXT:    v_lshl_add_u32 v0, v2, 2, v0
1966; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[8:11], 0 offen
1967; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
1968; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1969; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
1970; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
1971; GFX9-W64-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1972; GFX9-W64-NEXT:    s_endpgm
1973;
1974; GFX10-W32-LABEL: test_alloca:
1975; GFX10-W32:       ; %bb.0: ; %entry
1976; GFX10-W32-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
1977; GFX10-W32-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
1978; GFX10-W32-NEXT:    s_mov_b32 s10, -1
1979; GFX10-W32-NEXT:    s_mov_b32 s11, 0x31c16000
1980; GFX10-W32-NEXT:    s_add_u32 s8, s8, s0
1981; GFX10-W32-NEXT:    s_addc_u32 s9, s9, 0
1982; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
1983; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1984; GFX10-W32-NEXT:    v_mov_b32_e32 v3, 1
1985; GFX10-W32-NEXT:    v_lshl_add_u32 v2, v2, 2, 4
1986; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1987; GFX10-W32-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1988; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1989; GFX10-W32-NEXT:    buffer_store_dword v1, off, s[8:11], 0 offset:4
1990; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
1991; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1992; GFX10-W32-NEXT:    buffer_store_dword v0, v3, s[0:3], 0 idxen
1993; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
1994; GFX10-W32-NEXT:    buffer_load_dword v0, v2, s[8:11], 0 offen
1995; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
1996; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1997; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
1998; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
1999; GFX10-W32-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
2000; GFX10-W32-NEXT:    s_endpgm
2001entry:
  ; Interleaves buffer-store intrinsics with ordinary store/load accesses to an
  ; addrspace(5) array. In the expected output each intrinsic store follows an
  ; s_and with the saved exec mask, while the array store and the array load
  ; each follow an s_wqm.
2002  %array = alloca [32 x i32], align 4, addrspace(5)
2003
2004  call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i32 0)
2005
  ; Ordinary (non-intrinsic) volatile store of %a to element 0 of the array.
2006  %s.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 0
2007  store volatile i32 %a, i32 addrspace(5)* %s.gep, align 4
2008
2009  call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> undef, i32 1, i32 0, i32 0, i32 0)
2010
  ; The element loaded at index %idx, reinterpreted as float, becomes the
  ; coordinate of the image sample below.
2011  %c.gep = getelementptr [32 x i32], [32 x i32] addrspace(5)* %array, i32 0, i32 %idx
2012  %c = load i32, i32 addrspace(5)* %c.gep, align 4
2013  %c.bc = bitcast i32 %c to float
2014  %t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  ; The sampled vector is written out with a raw buffer store; the shader
  ; itself returns void.
2015  call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i32 0)
2016
2017  ret void
2018}
2019
2020; Must return to exact at the end of a non-void returning shader,
2021; otherwise the EXEC mask exported by the epilog will be wrong. This is true
2022; even if the shader has no kills, because a kill could have happened in a
2023; previous shader fragment.
; The generated checks below pin the expected sequence: exec is copied, s_wqm
; is applied for the first sample, and exec is and-ed back with the copy before
; the second sample, with no further exec writes before the return.
2029define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
2030; GFX9-W64-LABEL: test_nonvoid_return:
2031; GFX9-W64:       ; %bb.0:
2032; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
2033; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2034; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2035; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[0:1]
2036; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2037; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2038; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2039; GFX9-W64-NEXT:    ; return to shader part epilog
2040;
2041; GFX10-W32-LABEL: test_nonvoid_return:
2042; GFX10-W32:       ; %bb.0:
2043; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
2044; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2045; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2046; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s0
2047; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2048; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2049; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2050; GFX10-W32-NEXT:    ; return to shader part epilog
  ; The first sample only supplies the coordinate of the second one: just
  ; element 0 of its result is used.
2051  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2052  %tex0 = extractelement <4 x float> %tex, i32 0
  ; The second sample's full result is the shader's return value.
2053  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2054  ret <4 x float> %dtex
2055}
2056
; Same two dependent samples as test_nonvoid_return, but the entry block ends
; in a branch whose %if successor terminates in unreachable while %else returns
; the sampled value.
; NOTE(review): unlike test_nonvoid_return, the generated code below never
; copies exec before s_wqm and "restores" it with s_and exec, exec, exec --
; confirm that this output is intended.
2062define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
2063; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
2064; GFX9-W64:       ; %bb.0: ; %entry
2065; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2066; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
2067; GFX9-W64-NEXT:    s_and_b64 exec, exec, exec
2068; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2069; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2070; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
2071; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB34_2
2072; GFX9-W64-NEXT:  ; %bb.1: ; %else
2073; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2074; GFX9-W64-NEXT:    s_branch .LBB34_3
2075; GFX9-W64-NEXT:  .LBB34_2: ; %if
2076; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2077; GFX9-W64-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
2078; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2079; GFX9-W64-NEXT:  .LBB34_3:
2080;
2081; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
2082; GFX10-W32:       ; %bb.0: ; %entry
2083; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2084; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
2085; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, exec_lo
2086; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2087; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2088; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
2089; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB34_2
2090; GFX10-W32-NEXT:  ; %bb.1: ; %else
2091; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2092; GFX10-W32-NEXT:    s_branch .LBB34_3
2093; GFX10-W32-NEXT:  .LBB34_2: ; %if
2094; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2095; GFX10-W32-NEXT:    global_store_dwordx4 v[0:1], v[0:3], off
2096; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2097; GFX10-W32-NEXT:  .LBB34_3:
2098entry:
  ; The first sample only supplies the coordinate of the second one.
2099  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2100  %tex0 = extractelement <4 x float> %tex, i32 0
2101  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
  ; %c is an inreg argument; the checks show the branch lowered to
  ; s_cmp_lt_i32 / s_cbranch_scc0.
2102  %cc = icmp sgt i32 %c, 0
2103  br i1 %cc, label %if, label %else
2104
2105if:
  ; This path stores the sample result and never returns.
2106  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
2107  unreachable
2108
2109else:
2110  ret <4 x float> %dtex
2111}
2112
2113; Test awareness that s_wqm_b64 clobbers SCC.
; The scalar compare feeding s_cbranch_scc must therefore be emitted after the
; s_wqm, which is the order the generated checks below pin down. Both samples
; are checked in whole quad mode, with exec restored from the saved copy only
; in %end, ahead of the buffer store.
2125define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
2126; GFX9-W64-LABEL: test_scc:
2127; GFX9-W64:       ; %bb.0: ; %main_body
2128; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2129; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v0
2130; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2131; GFX9-W64-NEXT:    s_cmp_lt_i32 s0, 1
2132; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB35_2
2133; GFX9-W64-NEXT:  ; %bb.1: ; %else
2134; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2135; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 1
2136; GFX9-W64-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
2137; GFX9-W64-NEXT:    s_cbranch_execz .LBB35_3
2138; GFX9-W64-NEXT:    s_branch .LBB35_4
2139; GFX9-W64-NEXT:  .LBB35_2:
2140; GFX9-W64-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2141; GFX9-W64-NEXT:  .LBB35_3: ; %if
2142; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2143; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2144; GFX9-W64-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
2145; GFX9-W64-NEXT:  .LBB35_4: ; %end
2146; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
2147; GFX9-W64-NEXT:    v_mov_b32_e32 v5, 1.0
2148; GFX9-W64-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
2149; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2150; GFX9-W64-NEXT:    ; return to shader part epilog
2151;
2152; GFX10-W32-LABEL: test_scc:
2153; GFX10-W32:       ; %bb.0: ; %main_body
2154; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v0
2155; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
2156; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2157; GFX10-W32-NEXT:    s_cmp_lt_i32 s0, 1
2158; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB35_2
2159; GFX10-W32-NEXT:  ; %bb.1: ; %else
2160; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2161; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 1
2162; GFX10-W32-NEXT:    image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
2163; GFX10-W32-NEXT:    s_cbranch_execz .LBB35_3
2164; GFX10-W32-NEXT:    s_branch .LBB35_4
2165; GFX10-W32-NEXT:  .LBB35_2:
2166; GFX10-W32-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
2167; GFX10-W32-NEXT:  .LBB35_3: ; %if
2168; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2169; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2170; GFX10-W32-NEXT:    image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
2171; GFX10-W32-NEXT:  .LBB35_4: ; %end
2172; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s1
2173; GFX10-W32-NEXT:    v_mov_b32_e32 v5, 1.0
2174; GFX10-W32-NEXT:    buffer_store_dword v5, v4, s[0:3], 0 idxen
2175; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2176; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2177; GFX10-W32-NEXT:    ; return to shader part epilog
2178main_body:
  ; %sel is an inreg argument, so the branch on it is lowered to a scalar
  ; compare plus s_cbranch_scc -- the SCC use this test is about.
2179  %cc = icmp sgt i32 %sel, 0
2180  br i1 %cc, label %if, label %else
2181
2182if:
  ; 1D sample at coordinate 0.0.
2183  %r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2184  br label %end
2185
2186else:
  ; 2D sample; the second coordinate is the bit pattern of integer 1
  ; (emitted as v_mov_b32 v1, 1 in the checks).
2187  %r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
2188  br label %end
2189
2190end:
2191  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  ; The checks expect this store only after exec has been and-ed with the
  ; copy saved in main_body.
2192  call void @llvm.amdgcn.struct.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
2193  ret <4 x float> %r
2194}
2195
2196; Check a case of a block being entirely WQM except for a bit of WWM.
2197; There was a bug where it forgot to enter and leave WWM.
; In the generated checks the %IF block runs under the s_wqm issued in
; main_body, and only the ds_swizzle is bracketed by s_or_saveexec ... -1 /
; s_mov exec.
2198define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2199; GFX9-W64-LABEL: test_wwm_within_wqm:
2200; GFX9-W64:       ; %bb.0: ; %main_body
2201; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2202; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2203; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2204; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
2205; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2206; GFX9-W64-NEXT:    s_cbranch_execz .LBB36_2
2207; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2208; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2209; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2210; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2211; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2212; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v0, v0
2213; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2214; GFX9-W64-NEXT:    s_not_b64 exec, exec
2215; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 0
2216; GFX9-W64-NEXT:    s_not_b64 exec, exec
2217; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2218; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2219; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2220; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2221; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2222; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v0
2223; GFX9-W64-NEXT:  .LBB36_2: ; %ENDIF
2224; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2225; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2226; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2227; GFX9-W64-NEXT:    ; return to shader part epilog
2228;
2229; GFX10-W32-LABEL: test_wwm_within_wqm:
2230; GFX10-W32:       ; %bb.0: ; %main_body
2231; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2232; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2233; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2234; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
2235; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
2236; GFX10-W32-NEXT:    s_cbranch_execz .LBB36_2
2237; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2238; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2239; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2240; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2241; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2242; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v0, v0
2243; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2244; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2245; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 0
2246; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2247; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2248; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2249; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2250; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2251; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2252; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v0
2253; GFX10-W32-NEXT:  .LBB36_2: ; %ENDIF
2254; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2255; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2256; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2257; GFX10-W32-NEXT:    ; return to shader part epilog
2258main_body:
  ; %z is a per-lane (non-inreg) argument; the checks show the branch lowered
  ; with v_cmp + s_and_saveexec.
2259  %cmp = icmp eq i32 %z, 0
2260  br i1 %cmp, label %IF, label %ENDIF
2261
2262IF:
  ; Two dependent samples: element 0 of the first is the coordinate of the
  ; second.
2263  %c.bc = bitcast i32 %c to float
2264  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2265  %tex0 = extractelement <4 x float> %tex, i32 0
2266  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2267  %dataf = extractelement <4 x float> %dtex, i32 0
2268  %data1 = fptosi float %dataf to i32
  ; set.inactive is checked as: copy the value, s_not exec, write 0, s_not exec.
2269  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  ; Pattern 2079 is printed as offset:swizzle(SWAP,2) in the checks.
2270  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
  ; The wwm marker is what makes the swizzle above run between
  ; s_or_saveexec ... -1 and the exec restore.
2271  %data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
2272  %data4f = sitofp i32 %data4 to float
2273  br label %ENDIF
2274
2275ENDIF:
  ; 0.0 when %IF was skipped.
2276  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2277  ret float %r
2278}
2279
2280; Check that WWM is triggered by the strict_wwm intrinsic.
; Both buffer loads and the fadd feed the intrinsic, so the checks expect all
; of them between s_or_saveexec ... -1 and the exec restore; only the final
; copy into v0 happens after exec is restored.
2281define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
2282; GFX9-W64-LABEL: test_strict_wwm1:
2283; GFX9-W64:       ; %bb.0: ; %main_body
2284; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2285; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2286; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
2287; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2288; GFX9-W64-NEXT:    s_nop 0
2289; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2290; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2291; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
2292; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2293; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2294; GFX9-W64-NEXT:    ; return to shader part epilog
2295;
2296; GFX10-W32-LABEL: test_strict_wwm1:
2297; GFX10-W32:       ; %bb.0: ; %main_body
2298; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2299; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2300; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
2301; GFX10-W32-NEXT:    s_clause 0x1
2302; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2303; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2304; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2305; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
2306; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2307; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2308; GFX10-W32-NEXT:    ; return to shader part epilog
2309main_body:
  ; Two indexed buffer loads whose sum is the operand of strict.wwm.
2310  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
2311  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
2312  %out = fadd float %src0, %src1
2313  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2314  ret float %out.0
2315}
2316
2317; Same as above, but with an integer type.
; The loaded floats are bitcast to i32 and added as integers; the checks
; expect the integer add (v_add_u32 / v_add_nc_u32) inside the
; s_or_saveexec ... -1 region together with the loads.
2318define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
2319; GFX9-W64-LABEL: test_strict_wwm2:
2320; GFX9-W64:       ; %bb.0: ; %main_body
2321; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2322; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2323; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
2324; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2325; GFX9-W64-NEXT:    s_nop 0
2326; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2327; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2328; GFX9-W64-NEXT:    v_add_u32_e32 v1, v1, v2
2329; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2330; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2331; GFX9-W64-NEXT:    ; return to shader part epilog
2332;
2333; GFX10-W32-LABEL: test_strict_wwm2:
2334; GFX10-W32:       ; %bb.0: ; %main_body
2335; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2336; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2337; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
2338; GFX10-W32-NEXT:    s_clause 0x1
2339; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2340; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 idxen
2341; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2342; GFX10-W32-NEXT:    v_add_nc_u32_e32 v1, v1, v2
2343; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2344; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2345; GFX10-W32-NEXT:    ; return to shader part epilog
2346main_body:
2347  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
2348  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
  ; Reinterpret the loaded bits as integers so the add is an integer add.
2349  %src0.0 = bitcast float %src0 to i32
2350  %src1.0 = bitcast float %src1 to i32
2351  %out = add i32 %src0.0, %src1.0
2352  %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
  ; Cast back only because the shader's return type is float.
2353  %out.1 = bitcast i32 %out.0 to float
2354  ret float %out.1
2355}
2356
2357; Check that we don't leave WWM on for computations that don't require WWM,
2358; since that will lead to clobbering things that aren't supposed to be clobbered
2359; in cases like this.
2360; We enforce this by checking that v_add gets emitted in the same block as
2361; WWM computations.
; In the checks, the fadd feeding strict.wwm sits inside the
; s_or_saveexec ... -1 region, while the fadd that consumes the intrinsic's
; result is emitted after exec is restored.
2362define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
2363; GFX9-W64-LABEL: test_strict_wwm3:
2364; GFX9-W64:       ; %bb.0: ; %main_body
2365; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2366; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2367; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2368; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2369; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2370; GFX9-W64-NEXT:    s_cbranch_execz .LBB39_2
2371; GFX9-W64-NEXT:  ; %bb.1: ; %if
2372; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2373; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2374; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2375; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2376; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v1
2377; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2378; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2379; GFX9-W64-NEXT:    v_add_f32_e32 v0, v1, v0
2380; GFX9-W64-NEXT:  .LBB39_2: ; %endif
2381; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
2382; GFX9-W64-NEXT:    ; return to shader part epilog
2383;
2384; GFX10-W32-LABEL: test_strict_wwm3:
2385; GFX10-W32:       ; %bb.0: ; %main_body
2386; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2387; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2388; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2389; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2390; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2391; GFX10-W32-NEXT:    s_cbranch_execz .LBB39_2
2392; GFX10-W32-NEXT:  ; %bb.1: ; %if
2393; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2394; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2395; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2396; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2397; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v1
2398; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2399; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2400; GFX10-W32-NEXT:    v_add_f32_e32 v0, v1, v0
2401; GFX10-W32-NEXT:  .LBB39_2: ; %endif
2402; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2403; GFX10-W32-NEXT:    ; return to shader part epilog
2404main_body:
2405  ; use mbcnt to make sure the branch is divergent
2406  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2407  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; %if is entered only when %hi < 16.
2408  %cc = icmp uge i32 %hi, 16
2409  br i1 %cc, label %endif, label %if
2410
2411if:
2412  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
  ; %out is the operand of strict.wwm.
2413  %out = fadd float %src, %src
2414  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
  ; %out.1 only uses the intrinsic's result; this is the add that must not
  ; stay in WWM.
2415  %out.1 = fadd float %src, %out.0
2416  br label %endif
2417
2418endif:
  ; 0.0 when %if was skipped.
2419  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
2420  ret float %out.2
2421}
2422
2423; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
2424; write could clobber disabled channels in the non-WWM one.
2425; We enforce this by checking that v_mov gets emitted in the same block as
2426; WWM computations.
; In the checks the WWM add writes v1, and a separate v_mov copies it into v0
; (which already holds the 0.0 used when %if is skipped) only after exec has
; been restored.
2427define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
2428; GFX9-W64-LABEL: test_strict_wwm4:
2429; GFX9-W64:       ; %bb.0: ; %main_body
2430; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2431; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2432; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2433; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2434; GFX9-W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2435; GFX9-W64-NEXT:    s_cbranch_execz .LBB40_2
2436; GFX9-W64-NEXT:  ; %bb.1: ; %if
2437; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2438; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2439; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2440; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2441; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
2442; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2443; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2444; GFX9-W64-NEXT:  .LBB40_2: ; %endif
2445; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[2:3]
2446; GFX9-W64-NEXT:    ; return to shader part epilog
2447;
2448; GFX10-W32-LABEL: test_strict_wwm4:
2449; GFX10-W32:       ; %bb.0: ; %main_body
2450; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2451; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2452; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2453; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2454; GFX10-W32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
2455; GFX10-W32-NEXT:    s_cbranch_execz .LBB40_2
2456; GFX10-W32-NEXT:  ; %bb.1: ; %if
2457; GFX10-W32-NEXT:    s_or_saveexec_b32 s2, -1
2458; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2459; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2460; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2461; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
2462; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s2
2463; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2464; GFX10-W32-NEXT:  .LBB40_2: ; %endif
2465; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2466; GFX10-W32-NEXT:    ; return to shader part epilog
2467main_body:
2468  ; use mbcnt to make sure the branch is divergent
2469  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2470  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; %if is entered only when %hi < 16.
2471  %cc = icmp uge i32 %hi, 16
2472  br i1 %cc, label %endif, label %if
2473
2474if:
2475  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
2476  %out = fadd float %src, %src
  ; The strict.wwm result goes straight into the phi in %endif.
2477  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2478  br label %endif
2479
2480endif:
  ; Merges the WWM-produced value with the constant 0.0 from main_body.
2481  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2482  ret float %out.1
2483}
2484
2485; Make sure the transition from Exact to WWM then WQM works properly.
; The IR has three phases: a plain load/store pair, a load + fadd feeding
; strict.wwm, and an fadd feeding llvm.amdgcn.wqm. The checks expect the
; s_wqm directly after the exec restore that ends the WWM region, and a final
; s_and of exec with the copy taken at function entry.
2486define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
2487; GFX9-W64-LABEL: test_strict_wwm5:
2488; GFX9-W64:       ; %bb.0: ; %main_body
2489; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2490; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s0
2491; GFX9-W64-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2492; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2493; GFX9-W64-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
2494; GFX9-W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
2495; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
2496; GFX9-W64-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2497; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2498; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v1
2499; GFX9-W64-NEXT:    s_mov_b64 exec, s[4:5]
2500; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2501; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2502; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
2503; GFX9-W64-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
2504; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[2:3]
2505; GFX9-W64-NEXT:    ; return to shader part epilog
2506;
2507; GFX10-W32-LABEL: test_strict_wwm5:
2508; GFX10-W32:       ; %bb.0: ; %main_body
2509; GFX10-W32-NEXT:    v_mov_b32_e32 v0, s0
2510; GFX10-W32-NEXT:    s_mov_b32 s2, exec_lo
2511; GFX10-W32-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 idxen
2512; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2513; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
2514; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2515; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2516; GFX10-W32-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 idxen
2517; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2518; GFX10-W32-NEXT:    buffer_load_dword v1, v1, s[0:3], 0 idxen
2519; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2520; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v1
2521; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2522; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2523; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2524; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
2525; GFX10-W32-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $exec killed $exec
2526; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
2527; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
2528; GFX10-W32-NEXT:    ; return to shader part epilog
2529main_body:
  ; Phase 1: load/store pair on %idx0 with no WQM/WWM marker attached.
2530  %src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
2531  call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
  ; Phase 2: the load on %idx1 and its fadd feed strict.wwm.
2532  %src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
2533  %temp = fadd float %src1, %src1
2534  %temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
  ; Phase 3: the fadd on the WWM result feeds llvm.amdgcn.wqm.
2535  %out = fadd float %temp.0, %temp.0
2536  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
2537  ret float %out.0
2538}
2539
2540; Check that WWM is turned on correctly across basic block boundaries.
2541; if..then..endif version
; %src0 is loaded in main_body but consumed by the WWM fadd in %if, so the
; checks expect the main_body load to sit in its own s_or_saveexec ... -1
; region, separate from the one in %if.
2546define amdgpu_ps float @test_strict_wwm6_then() {
2547; GFX9-W64-LABEL: test_strict_wwm6_then:
2548; GFX9-W64:       ; %bb.0: ; %main_body
2549; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2550; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
2551; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2552; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2553; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2554; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2555; GFX9-W64-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
2556; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2557; GFX9-W64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2558; GFX9-W64-NEXT:    s_cbranch_execz .LBB42_2
2559; GFX9-W64-NEXT:  ; %bb.1: ; %if
2560; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2561; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
2562; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2563; GFX9-W64-NEXT:    v_add_f32_e32 v1, v1, v2
2564; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2565; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2566; GFX9-W64-NEXT:  .LBB42_2: ; %endif
2567; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2568; GFX9-W64-NEXT:    ; return to shader part epilog
2569;
2570; GFX10-W32-LABEL: test_strict_wwm6_then:
2571; GFX10-W32:       ; %bb.0: ; %main_body
2572; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2573; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
2574; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2575; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2576; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2577; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v0, -1, v0
2578; GFX10-W32-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 16, v0
2579; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2580; GFX10-W32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2581; GFX10-W32-NEXT:    s_cbranch_execz .LBB42_2
2582; GFX10-W32-NEXT:  ; %bb.1: ; %if
2583; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2584; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
2585; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2586; GFX10-W32-NEXT:    v_add_f32_e32 v1, v1, v2
2587; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2588; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2589; GFX10-W32-NEXT:  .LBB42_2: ; %endif
2590; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2591; GFX10-W32-NEXT:    ; return to shader part epilog
2592main_body:
  ; Loaded here, consumed by the WWM fadd in %if.
2593  %src0 = load volatile float, float addrspace(1)* undef
2594  ; use mbcnt to make sure the branch is divergent
2595  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2596  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
  ; %if is entered only when %hi < 16.
2597  %cc = icmp uge i32 %hi, 16
2598  br i1 %cc, label %endif, label %if
2599
2600if:
2601  %src1 = load volatile float, float addrspace(1)* undef
  ; Combines a value from main_body with one from this block under strict.wwm.
2602  %out = fadd float %src0, %src1
2603  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2604  br label %endif
2605
2606endif:
  ; 0.0 when %if was skipped.
2607  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
2608  ret float %out.1
2609}
2610
2611; Check that WWM is turned on correctly across basic block boundaries.
2612; loop version
; %src0 is loaded before the loop and consumed by the WWM fadd inside it. The
; checks expect the pre-loop load in its own s_or_saveexec ... -1 region, and
; the counter update and compare in the loop body emitted outside the WWM
; regions.
2613define amdgpu_ps float @test_strict_wwm6_loop() {
2614; GFX9-W64-LABEL: test_strict_wwm6_loop:
2615; GFX9-W64:       ; %bb.0: ; %main_body
2616; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2617; GFX9-W64-NEXT:    global_load_dword v1, v[3:4], off glc
2618; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2619; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2620; GFX9-W64-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2621; GFX9-W64-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
2622; GFX9-W64-NEXT:    s_mov_b64 s[0:1], 0
2623; GFX9-W64-NEXT:  .LBB43_1: ; %loop
2624; GFX9-W64-NEXT:    ; =>This Inner Loop Header: Depth=1
2625; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2626; GFX9-W64-NEXT:    global_load_dword v2, v[3:4], off glc
2627; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2628; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2629; GFX9-W64-NEXT:    v_add_u32_e32 v3, -1, v3
2630; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2631; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2632; GFX9-W64-NEXT:    v_add_f32_e32 v2, v1, v2
2633; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2634; GFX9-W64-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
2635; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2636; GFX9-W64-NEXT:    s_andn2_b64 exec, exec, s[0:1]
2637; GFX9-W64-NEXT:    s_cbranch_execnz .LBB43_1
2638; GFX9-W64-NEXT:  ; %bb.2: ; %endloop
2639; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[0:1]
2640; GFX9-W64-NEXT:    ; return to shader part epilog
2641;
2642; GFX10-W32-LABEL: test_strict_wwm6_loop:
2643; GFX10-W32:       ; %bb.0: ; %main_body
2644; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2645; GFX10-W32-NEXT:    global_load_dword v1, v[3:4], off glc dlc
2646; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2647; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2648; GFX10-W32-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
2649; GFX10-W32-NEXT:    s_mov_b32 s0, 0
2650; GFX10-W32-NEXT:    v_mbcnt_hi_u32_b32 v3, -1, v0
2651; GFX10-W32-NEXT:  .LBB43_1: ; %loop
2652; GFX10-W32-NEXT:    ; =>This Inner Loop Header: Depth=1
2653; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2654; GFX10-W32-NEXT:    global_load_dword v2, v[3:4], off glc dlc
2655; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2656; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2657; GFX10-W32-NEXT:    v_add_nc_u32_e32 v3, -1, v3
2658; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2659; GFX10-W32-NEXT:    v_add_f32_e32 v2, v1, v2
2660; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2661; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
2662; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2663; GFX10-W32-NEXT:    s_or_b32 s0, vcc_lo, s0
2664; GFX10-W32-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
2665; GFX10-W32-NEXT:    s_cbranch_execnz .LBB43_1
2666; GFX10-W32-NEXT:  ; %bb.2: ; %endloop
2667; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2668; GFX10-W32-NEXT:    ; return to shader part epilog
2669main_body:
  ; Loaded once here, consumed by the WWM fadd on every loop iteration.
2670  %src0 = load volatile float, float addrspace(1)* undef
2671  ; use mbcnt to make sure the branch is divergent
2672  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
2673  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
2674  br label %loop
2675
2676loop:
  ; The counter starts at the mbcnt result and is decremented each iteration.
2677  %counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
2678  %src1 = load volatile float, float addrspace(1)* undef
2679  %out = fadd float %src0, %src1
2680  %out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
2681  %counter.1 = sub i32 %counter, 1
  ; Keep looping until the decremented counter reaches 0.
2682  %cc = icmp ne i32 %counter.1, 0
2683  br i1 %cc, label %loop, label %endloop
2684
2685endloop:
  ; Returns the strict.wwm result of the last iteration executed.
2686  ret float %out.0
2687}
2688
2689; Check that @llvm.amdgcn.set.inactive disables WWM.
; The IR loads a value, runs it through set.inactive (inactive-lane value 0),
; adds it to itself and passes the sum through strict.wwm before storing it.
; In the expected output the set.inactive sequence (a v_mov of 0 between two
; s_not of exec) is emitted before the s_or_saveexec ..., -1 that enables all
; lanes for the add; exec is restored right after the add.
2690define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
2691; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
2692; GFX9-W64:       ; %bb.0: ; %main_body
2693; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
2694; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
2695; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2696; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2697; GFX9-W64-NEXT:    s_not_b64 exec, exec
2698; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2699; GFX9-W64-NEXT:    s_not_b64 exec, exec
2700; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2701; GFX9-W64-NEXT:    v_add_u32_e32 v0, v0, v0
2702; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2703; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2704; GFX9-W64-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
2705; GFX9-W64-NEXT:    s_endpgm
2706;
2707; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
2708; GFX10-W32:       ; %bb.0: ; %main_body
2709; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
2710; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[0:3], 0 idxen
2711; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2712; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2713; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2714; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2715; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2716; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2717; GFX10-W32-NEXT:    v_add_nc_u32_e32 v0, v0, v0
2718; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2719; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2720; GFX10-W32-NEXT:    buffer_store_dword v2, v1, s[0:3], 0 idxen
2721; GFX10-W32-NEXT:    s_endpgm
2722main_body:
2723  %src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
2724  %src.0 = bitcast float %src to i32
  ; Second operand (0) is the value written for inactive lanes.
2725  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  ; The add feeds strict.wwm, so it is the instruction expected inside the
  ; s_or_saveexec / restore pair.
2726  %out = add i32 %src.1, %src.1
2727  %out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
2728  %out.1 = bitcast i32 %out.0 to float
2729  call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
2730  ret void
2731}
2732
2733; Check a case of a block being entirely WQM except for a bit of WWM.
2734; There was a bug where it forgot to enter and leave WWM.
; In the expected output the function enters WQM with s_wqm at the top, and
; inside %IF the ds_swizzle is bracketed by s_or_saveexec ..., -1 and a
; restore of exec.
; NOTE(review): the %data argument is not referenced in the body.
2735define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2736; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
2737; GFX9-W64:       ; %bb.0: ; %main_body
2738; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2739; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2740; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2741; GFX9-W64-NEXT:    v_mov_b32_e32 v1, 0
2742; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2743; GFX9-W64-NEXT:    s_cbranch_execz .LBB45_2
2744; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2745; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2746; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2747; GFX9-W64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
2748; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2749; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v0, v0
2750; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2751; GFX9-W64-NEXT:    s_not_b64 exec, exec
2752; GFX9-W64-NEXT:    v_mov_b32_e32 v2, 0
2753; GFX9-W64-NEXT:    s_not_b64 exec, exec
2754; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
2755; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2756; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2757; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2758; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2759; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v1, v0
2760; GFX9-W64-NEXT:  .LBB45_2: ; %ENDIF
2761; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2762; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2763; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v1
2764; GFX9-W64-NEXT:    ; return to shader part epilog
2765;
2766; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
2767; GFX10-W32:       ; %bb.0: ; %main_body
2768; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2769; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2770; GFX10-W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
2771; GFX10-W32-NEXT:    v_mov_b32_e32 v1, 0
2772; GFX10-W32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
2773; GFX10-W32-NEXT:    s_cbranch_execz .LBB45_2
2774; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2775; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2776; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2777; GFX10-W32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2778; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2779; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v0, v0
2780; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2781; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2782; GFX10-W32-NEXT:    v_mov_b32_e32 v2, 0
2783; GFX10-W32-NEXT:    s_not_b32 exec_lo, exec_lo
2784; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
2785; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2786; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2787; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2788; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2789; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v1, v0
2790; GFX10-W32-NEXT:  .LBB45_2: ; %ENDIF
2791; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2792; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2793; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v1
2794; GFX10-W32-NEXT:    ; return to shader part epilog
2795main_body:
2796  %cmp = icmp eq i32 %z, 0
2797  br i1 %cmp, label %IF, label %ENDIF
2798
2799IF:
2800  %c.bc = bitcast i32 %c to float
2801  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2802  %tex0 = extractelement <4 x float> %tex, i32 0
  ; The second sample uses element 0 of the first sample as its coordinate.
2803  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2804  %dataf = extractelement <4 x float> %dtex, i32 0
2805  %data1 = fptosi float %dataf to i32
2806  %data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
  ; Pattern 2079 is printed as swizzle(SWAP,2) in the expected output.
2807  %data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
2808  %data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
2809  %data4f = sitofp i32 %data4 to float
2810  br label %ENDIF
2811
2812ENDIF:
2813  %r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
2814  ret float %r
2815}
2816
2817; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
; In the expected output, besides the s_wqm at function entry, a save-exec /
; s_wqm / restore-exec triple surrounds the copy of the coordinate in the
; entry block and the two image samples plus ds_swizzle in %IF.
; NOTE(review): the %data argument is not referenced in the body.
2818define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
2819; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
2820; GFX9-W64:       ; %bb.0: ; %main_body
2821; GFX9-W64-NEXT:    s_mov_b64 s[12:13], exec
2822; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2823; GFX9-W64-NEXT:    s_mov_b64 s[14:15], exec
2824; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2825; GFX9-W64-NEXT:    v_mov_b32_e32 v2, v0
2826; GFX9-W64-NEXT:    s_mov_b64 exec, s[14:15]
2827; GFX9-W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
2828; GFX9-W64-NEXT:    v_mov_b32_e32 v0, 0
2829; GFX9-W64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
2830; GFX9-W64-NEXT:    s_cbranch_execz .LBB46_2
2831; GFX9-W64-NEXT:  ; %bb.1: ; %IF
2832; GFX9-W64-NEXT:    s_mov_b64 s[16:17], exec
2833; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2834; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2835; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2836; GFX9-W64-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
2837; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2838; GFX9-W64-NEXT:    v_cvt_i32_f32_e32 v2, v2
2839; GFX9-W64-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2840; GFX9-W64-NEXT:    s_mov_b64 exec, s[16:17]
2841; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
2842; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2843; GFX9-W64-NEXT:    v_cvt_f32_i32_e32 v0, v0
2844; GFX9-W64-NEXT:  .LBB46_2: ; %ENDIF
2845; GFX9-W64-NEXT:    s_or_b64 exec, exec, s[14:15]
2846; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[12:13]
2847; GFX9-W64-NEXT:    ; return to shader part epilog
2848;
2849; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
2850; GFX10-W32:       ; %bb.0: ; %main_body
2851; GFX10-W32-NEXT:    s_mov_b32 s12, exec_lo
2852; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2853; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
2854; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2855; GFX10-W32-NEXT:    v_mov_b32_e32 v2, v0
2856; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s13
2857; GFX10-W32-NEXT:    v_mov_b32_e32 v0, 0
2858; GFX10-W32-NEXT:    s_mov_b32 s13, exec_lo
2859; GFX10-W32-NEXT:    v_cmpx_eq_u32_e32 0, v1
2860; GFX10-W32-NEXT:    s_cbranch_execz .LBB46_2
2861; GFX10-W32-NEXT:  ; %bb.1: ; %IF
2862; GFX10-W32-NEXT:    s_mov_b32 s14, exec_lo
2863; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2864; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2865; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2866; GFX10-W32-NEXT:    image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
2867; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2868; GFX10-W32-NEXT:    v_cvt_i32_f32_e32 v2, v2
2869; GFX10-W32-NEXT:    ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
2870; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s14
2871; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
2872; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2873; GFX10-W32-NEXT:    v_cvt_f32_i32_e32 v0, v0
2874; GFX10-W32-NEXT:  .LBB46_2: ; %ENDIF
2875; GFX10-W32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
2876; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
2877; GFX10-W32-NEXT:    ; return to shader part epilog
2878main_body:
2879  %cmp = icmp eq i32 %z, 0
2880  br i1 %cmp, label %IF, label %ENDIF
2881
2882IF:
2883  %c.bc = bitcast i32 %c to float
2884  %tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2885  %tex0 = extractelement <4 x float> %tex, i32 0
  ; The second sample uses element 0 of the first sample as its coordinate.
2886  %dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
2887  %dataf = extractelement <4 x float> %dtex, i32 0
2888  %data1 = fptosi float %dataf to i32
  ; No set.inactive here; the swizzle result goes straight into strict.wqm.
2889  %data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
2890  %data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
2891  %data3f = sitofp i32 %data3 to float
2892  br label %ENDIF
2893
2894ENDIF:
2895  %r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
2896  ret float %r
2897}
2898
2899; TODO: The StrictWQM -> WQM transition could be improved: WQM could reuse the exec mask from the previous state instead of executing s_wqm again.
; The IR combines a value passed through strict.wqm, a value passed through
; strict.wwm and an image sample that consumes their sum. In the expected
; output the strict.wqm fadd is followed by a restore of exec and then
; immediately by another s_wqm, which is the sequence the TODO refers to.
2900define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, <4 x i32> inreg %res2, float %inp, <8 x i32> inreg %res3) {
2901; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
2902; GFX9-W64:       ; %bb.0: ; %main_body
2903; GFX9-W64-NEXT:    s_mov_b64 s[28:29], exec
2904; GFX9-W64-NEXT:    s_mov_b32 s19, s17
2905; GFX9-W64-NEXT:    s_mov_b64 s[30:31], exec
2906; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2907; GFX9-W64-NEXT:    s_mov_b32 s23, s5
2908; GFX9-W64-NEXT:    s_mov_b32 s22, s4
2909; GFX9-W64-NEXT:    s_mov_b32 s21, s3
2910; GFX9-W64-NEXT:    s_mov_b32 s20, s2
2911; GFX9-W64-NEXT:    s_mov_b32 s27, s9
2912; GFX9-W64-NEXT:    s_mov_b32 s26, s8
2913; GFX9-W64-NEXT:    s_mov_b32 s25, s7
2914; GFX9-W64-NEXT:    s_mov_b32 s24, s6
2915; GFX9-W64-NEXT:    s_mov_b32 s18, s16
2916; GFX9-W64-NEXT:    s_mov_b32 s17, s15
2917; GFX9-W64-NEXT:    s_mov_b32 s16, s14
2918; GFX9-W64-NEXT:    s_mov_b32 s15, s13
2919; GFX9-W64-NEXT:    s_mov_b32 s14, s12
2920; GFX9-W64-NEXT:    s_mov_b32 s13, s11
2921; GFX9-W64-NEXT:    s_mov_b32 s12, s10
2922; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s1
2923; GFX9-W64-NEXT:    s_mov_b64 exec, s[30:31]
2924; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
2925; GFX9-W64-NEXT:    s_mov_b64 s[2:3], exec
2926; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2927; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
2928; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2929; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
2930; GFX9-W64-NEXT:    v_mov_b32_e32 v3, s0
2931; GFX9-W64-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
2932; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
2933; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
2934; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2935; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
2936; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
2937; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
2938; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
2939; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
2940; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
2941; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2942; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
2943; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
2944; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[28:29]
2945; GFX9-W64-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
2946; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2947; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
2948; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
2949; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
2950; GFX9-W64-NEXT:    ; return to shader part epilog
2951;
2952; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
2953; GFX10-W32:       ; %bb.0: ; %main_body
2954; GFX10-W32-NEXT:    s_mov_b32 s28, exec_lo
2955; GFX10-W32-NEXT:    s_mov_b32 s19, s17
2956; GFX10-W32-NEXT:    s_mov_b32 s29, exec_lo
2957; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2958; GFX10-W32-NEXT:    s_mov_b32 s23, s5
2959; GFX10-W32-NEXT:    s_mov_b32 s22, s4
2960; GFX10-W32-NEXT:    s_mov_b32 s21, s3
2961; GFX10-W32-NEXT:    s_mov_b32 s20, s2
2962; GFX10-W32-NEXT:    s_mov_b32 s27, s9
2963; GFX10-W32-NEXT:    s_mov_b32 s26, s8
2964; GFX10-W32-NEXT:    s_mov_b32 s25, s7
2965; GFX10-W32-NEXT:    s_mov_b32 s24, s6
2966; GFX10-W32-NEXT:    s_mov_b32 s18, s16
2967; GFX10-W32-NEXT:    s_mov_b32 s17, s15
2968; GFX10-W32-NEXT:    s_mov_b32 s16, s14
2969; GFX10-W32-NEXT:    s_mov_b32 s15, s13
2970; GFX10-W32-NEXT:    s_mov_b32 s14, s12
2971; GFX10-W32-NEXT:    s_mov_b32 s13, s11
2972; GFX10-W32-NEXT:    s_mov_b32 s12, s10
2973; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s1
2974; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s29
2975; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
2976; GFX10-W32-NEXT:    s_mov_b32 s1, exec_lo
2977; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2978; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[20:23], 0 idxen
2979; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2980; GFX10-W32-NEXT:    s_or_saveexec_b32 s1, -1
2981; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s0
2982; GFX10-W32-NEXT:    buffer_load_dword v3, v3, s[24:27], 0 idxen
2983; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s1
2984; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
2985; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2986; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
2987; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
2988; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
2989; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
2990; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
2991; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2992; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
2993; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
2994; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
2995; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
2996; GFX10-W32-NEXT:    image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
2997; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
2998; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[20:23], 0 idxen
2999; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[20:23], 0 idxen
3000; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3001; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
3002; GFX10-W32-NEXT:    ; return to shader part epilog
3003main_body:
3004  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
  ; strict.wqm chain: reload from %res at %idx1, double it, mark it strict.wqm.
3005  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3006  %temp = fadd float %reload, %reload
3007  %temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
3008  %temp3 = fadd float %temp2, %temp2
  ; strict.wwm chain: load from %res2 at %idx0 and mark it strict.wwm.
3009  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res2, i32 %idx0, i32 0, i32 0, i32 0)
3010  %temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
  ; The sum of both chains is the coordinate of the image sample.
3011  %temp5 = fadd float %temp3, %temp4
3012  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res, i1 false, i32 0, i32 0)
3013  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3014  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3015  ret float %out
3016}
3017
; Combines a value passed through strict.wwm, then a value passed through
; strict.wqm, then an image sample that consumes their sum. In the expected
; output the strict.wwm load and fadd are wrapped in s_or_saveexec ..., -1 /
; restore-exec pairs, and the strict.wqm load in save-exec / s_wqm /
; restore-exec.
3018define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
3019; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
3020; GFX9-W64:       ; %bb.0: ; %main_body
3021; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
3022; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3023; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
3024; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3025; GFX9-W64-NEXT:    s_mov_b32 s19, s5
3026; GFX9-W64-NEXT:    s_mov_b32 s18, s4
3027; GFX9-W64-NEXT:    s_mov_b32 s17, s3
3028; GFX9-W64-NEXT:    s_mov_b32 s16, s2
3029; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3030; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3031; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3032; GFX9-W64-NEXT:    s_mov_b32 s11, s9
3033; GFX9-W64-NEXT:    s_mov_b32 s10, s8
3034; GFX9-W64-NEXT:    s_mov_b32 s9, s7
3035; GFX9-W64-NEXT:    s_mov_b32 s8, s6
3036; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
3037; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
3038; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3039; GFX9-W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
3040; GFX9-W64-NEXT:    v_mov_b32_e32 v2, s1
3041; GFX9-W64-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
3042; GFX9-W64-NEXT:    s_mov_b64 exec, s[2:3]
3043; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
3044; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3045; GFX9-W64-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
3046; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3047; GFX9-W64-NEXT:    s_or_saveexec_b64 s[0:1], -1
3048; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3049; GFX9-W64-NEXT:    v_add_f32_e32 v2, v2, v2
3050; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3051; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3052; GFX9-W64-NEXT:    v_mov_b32_e32 v0, v2
3053; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3054; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3055; GFX9-W64-NEXT:    v_mov_b32_e32 v4, v3
3056; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v4
3057; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
3058; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3059; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3060; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3061; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3062; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3063; GFX9-W64-NEXT:    ; return to shader part epilog
3064;
3065; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
3066; GFX10-W32:       ; %bb.0: ; %main_body
3067; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
3068; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3069; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
3070; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3071; GFX10-W32-NEXT:    s_mov_b32 s19, s5
3072; GFX10-W32-NEXT:    s_mov_b32 s18, s4
3073; GFX10-W32-NEXT:    s_mov_b32 s17, s3
3074; GFX10-W32-NEXT:    s_mov_b32 s16, s2
3075; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3076; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3077; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3078; GFX10-W32-NEXT:    s_mov_b32 s11, s9
3079; GFX10-W32-NEXT:    s_mov_b32 s10, s8
3080; GFX10-W32-NEXT:    s_mov_b32 s9, s7
3081; GFX10-W32-NEXT:    s_mov_b32 s8, s6
3082; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
3083; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
3084; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3085; GFX10-W32-NEXT:    v_mov_b32_e32 v2, s1
3086; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3087; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3088; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3089; GFX10-W32-NEXT:    buffer_load_dword v2, v2, s[16:19], 0 idxen
3090; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3091; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3092; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3093; GFX10-W32-NEXT:    buffer_load_dword v3, v1, s[16:19], 0 idxen
3094; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3095; GFX10-W32-NEXT:    s_or_saveexec_b32 s0, -1
3096; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3097; GFX10-W32-NEXT:    v_add_f32_e32 v2, v2, v2
3098; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3099; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3100; GFX10-W32-NEXT:    v_mov_b32_e32 v0, v2
3101; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3102; GFX10-W32-NEXT:    v_mov_b32_e32 v4, v3
3103; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3104; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v4
3105; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3106; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3107; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3108; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3109; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3110; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3111; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
3112; GFX10-W32-NEXT:    ; return to shader part epilog
3113main_body:
3114  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ; strict.wwm chain: load from %res at %idx1, double it, mark it strict.wwm.
3115  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3116  %temp = fadd float %reload, %reload
3117  %temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
3118  %temp3 = fadd float %temp2, %temp2
  ; NOTE(review): despite its name, %reload_wwm feeds strict.wqm in this test.
3119  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3120  %temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  ; The sum of both chains is the coordinate of the image sample.
3121  %temp5 = fadd float %temp3, %temp4
3122  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
3123  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3124  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3125  ret float %out
3126}
3127
3128; TODO: The WQM -> StrictWQM transition could be improved: StrictWQM could reuse the exec mask from the previous state instead of executing s_wqm again.
; The IR has an image sample, then a value passed through strict.wqm, then a
; second image sample that consumes the sum of both. In the expected output
; the load for the first sample is preceded by s_wqm and directly followed by
; a save-exec / s_wqm pair for the strict.wqm load, which is the sequence the
; TODO refers to.
3129define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, <4 x i32> inreg %res, float %inp, <8 x i32> inreg %res2) {
3130; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
3131; GFX9-W64:       ; %bb.0: ; %main_body
3132; GFX9-W64-NEXT:    s_mov_b64 s[20:21], exec
3133; GFX9-W64-NEXT:    s_mov_b32 s15, s13
3134; GFX9-W64-NEXT:    s_mov_b64 s[22:23], exec
3135; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3136; GFX9-W64-NEXT:    s_mov_b32 s19, s5
3137; GFX9-W64-NEXT:    s_mov_b32 s18, s4
3138; GFX9-W64-NEXT:    s_mov_b32 s17, s3
3139; GFX9-W64-NEXT:    s_mov_b32 s16, s2
3140; GFX9-W64-NEXT:    s_mov_b32 s14, s12
3141; GFX9-W64-NEXT:    s_mov_b32 s13, s11
3142; GFX9-W64-NEXT:    s_mov_b32 s12, s10
3143; GFX9-W64-NEXT:    s_mov_b32 s11, s9
3144; GFX9-W64-NEXT:    s_mov_b32 s10, s8
3145; GFX9-W64-NEXT:    s_mov_b32 s9, s7
3146; GFX9-W64-NEXT:    s_mov_b32 s8, s6
3147; GFX9-W64-NEXT:    v_mov_b32_e32 v1, s0
3148; GFX9-W64-NEXT:    s_mov_b64 exec, s[22:23]
3149; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3150; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3151; GFX9-W64-NEXT:    v_mov_b32_e32 v0, s1
3152; GFX9-W64-NEXT:    buffer_load_dword v0, v0, s[16:19], 0 idxen
3153; GFX9-W64-NEXT:    s_mov_b64 s[0:1], exec
3154; GFX9-W64-NEXT:    s_wqm_b64 exec, exec
3155; GFX9-W64-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
3156; GFX9-W64-NEXT:    s_mov_b64 exec, s[0:1]
3157; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3158; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3159; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3160; GFX9-W64-NEXT:    s_waitcnt vmcnt(1)
3161; GFX9-W64-NEXT:    v_mov_b32_e32 v3, v2
3162; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3163; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v0
3164; GFX9-W64-NEXT:    v_add_f32_e32 v0, v0, v3
3165; GFX9-W64-NEXT:    s_and_b64 exec, exec, s[20:21]
3166; GFX9-W64-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
3167; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3168; GFX9-W64-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3169; GFX9-W64-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3170; GFX9-W64-NEXT:    s_waitcnt vmcnt(0)
3171; GFX9-W64-NEXT:    ; return to shader part epilog
3172;
3173; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
3174; GFX10-W32:       ; %bb.0: ; %main_body
3175; GFX10-W32-NEXT:    s_mov_b32 s20, exec_lo
3176; GFX10-W32-NEXT:    s_mov_b32 s15, s13
3177; GFX10-W32-NEXT:    s_mov_b32 s21, exec_lo
3178; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3179; GFX10-W32-NEXT:    s_mov_b32 s19, s5
3180; GFX10-W32-NEXT:    s_mov_b32 s18, s4
3181; GFX10-W32-NEXT:    s_mov_b32 s17, s3
3182; GFX10-W32-NEXT:    s_mov_b32 s16, s2
3183; GFX10-W32-NEXT:    s_mov_b32 s14, s12
3184; GFX10-W32-NEXT:    s_mov_b32 s13, s11
3185; GFX10-W32-NEXT:    s_mov_b32 s12, s10
3186; GFX10-W32-NEXT:    s_mov_b32 s11, s9
3187; GFX10-W32-NEXT:    s_mov_b32 s10, s8
3188; GFX10-W32-NEXT:    s_mov_b32 s9, s7
3189; GFX10-W32-NEXT:    s_mov_b32 s8, s6
3190; GFX10-W32-NEXT:    v_mov_b32_e32 v1, s0
3191; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s21
3192; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3193; GFX10-W32-NEXT:    v_mov_b32_e32 v3, s1
3194; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3195; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3196; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3197; GFX10-W32-NEXT:    buffer_load_dword v0, v3, s[16:19], 0 idxen
3198; GFX10-W32-NEXT:    s_mov_b32 s0, exec_lo
3199; GFX10-W32-NEXT:    s_wqm_b32 exec_lo, exec_lo
3200; GFX10-W32-NEXT:    buffer_load_dword v2, v1, s[16:19], 0 idxen
3201; GFX10-W32-NEXT:    s_mov_b32 exec_lo, s0
3202; GFX10-W32-NEXT:    s_waitcnt vmcnt(1)
3203; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3204; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3205; GFX10-W32-NEXT:    v_mov_b32_e32 v3, v2
3206; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3207; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3208; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v0
3209; GFX10-W32-NEXT:    v_add_f32_e32 v0, v0, v3
3210; GFX10-W32-NEXT:    s_and_b32 exec_lo, exec_lo, s20
3211; GFX10-W32-NEXT:    image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
3212; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3213; GFX10-W32-NEXT:    buffer_store_dword v0, v1, s[16:19], 0 idxen
3214; GFX10-W32-NEXT:    buffer_load_dword v0, v1, s[16:19], 0 idxen
3215; GFX10-W32-NEXT:    s_waitcnt vmcnt(0)
3216; GFX10-W32-NEXT:    s_waitcnt_vscnt null, 0x0
3217; GFX10-W32-NEXT:    ; return to shader part epilog
3218main_body:
3219  call void @llvm.amdgcn.struct.buffer.store.f32(float %inp, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
  ; First sample: its coordinate is the doubled value loaded from %res at %idx1.
3220  %reload = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx1, i32 0, i32 0, i32 0)
3221  %temp = fadd float %reload, %reload
3222  %tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
3223  %temp2 = fadd float %tex, %tex
  ; NOTE(review): despite its name, %reload_wwm feeds strict.wqm in this test.
3224  %reload_wwm = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3225  %temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
  ; Second sample: its coordinate combines the first sample and the strict.wqm value.
3226  %temp4 = fadd float %temp2, %temp3
3227  %tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res, i1 false, i32 0, i32 0)
3228  call void @llvm.amdgcn.struct.buffer.store.f32(float %tex2, <4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3229  %out = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %res, i32 %idx0, i32 0, i32 0, i32 0)
3230  ret float %out
3231}
3232
3233; Check that the correct VCC register is selected for the kill condition: the
3234; wave32 output is expected to use vcc_lo/exec_lo, the wave64 output vcc/exec.
; NOTE(review): the previous wording said the WQM pass "incorrectly uses VCC
; for vector comparisons in Wave32 mode" - presumably describing the bug this
; test guards against rather than current behaviour; confirm.
3235define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(float addrspace(6)* inreg %0) {
3236; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
3237; GFX9-W64:       ; %bb.0: ; %main_body
3238; GFX9-W64-NEXT:    s_mov_b32 s3, 0x31016fac
3239; GFX9-W64-NEXT:    s_mov_b32 s2, 32
3240; GFX9-W64-NEXT:    s_mov_b32 s1, 0x8000
3241; GFX9-W64-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
3242; GFX9-W64-NEXT:    s_waitcnt lgkmcnt(0)
3243; GFX9-W64-NEXT:    v_cmp_le_f32_e64 vcc, s0, 0
3244; GFX9-W64-NEXT:    s_andn2_b64 s[4:5], exec, vcc
3245; GFX9-W64-NEXT:    s_cbranch_scc0 .LBB50_1
3246; GFX9-W64-NEXT:    s_endpgm
3247; GFX9-W64-NEXT:  .LBB50_1:
3248; GFX9-W64-NEXT:    s_mov_b64 exec, 0
3249; GFX9-W64-NEXT:    exp null off, off, off, off done vm
3250; GFX9-W64-NEXT:    s_endpgm
3251;
3252; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
3253; GFX10-W32:       ; %bb.0: ; %main_body
3254; GFX10-W32-NEXT:    s_mov_b32 s3, 0x31016fac
3255; GFX10-W32-NEXT:    s_mov_b32 s2, 32
3256; GFX10-W32-NEXT:    s_mov_b32 s1, 0x8000
3257; GFX10-W32-NEXT:    s_buffer_load_dword s0, s[0:3], 0x0
3258; GFX10-W32-NEXT:    s_waitcnt lgkmcnt(0)
3259; GFX10-W32-NEXT:    v_cmp_le_f32_e64 vcc_lo, s0, 0
3260; GFX10-W32-NEXT:    s_andn2_b32 s4, exec_lo, vcc_lo
3261; GFX10-W32-NEXT:    s_cbranch_scc0 .LBB50_1
3262; GFX10-W32-NEXT:    s_endpgm
3263; GFX10-W32-NEXT:  .LBB50_1:
3264; GFX10-W32-NEXT:    s_mov_b32 exec_lo, 0
3265; GFX10-W32-NEXT:    exp null off, off, off, off done vm
3266; GFX10-W32-NEXT:    s_endpgm
3267main_body:
  ; Build the <4 x i32> operand for s.buffer.load: element 0 is the incoming
  ; pointer, elements 1-3 are the constants that appear as 0x8000, 32 and
  ; 0x31016fac in the expected s_mov_b32 instructions.
3268  %1 = ptrtoint float addrspace(6)* %0 to i32
3269  %2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
3270  %3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
  ; %4 is the condition handed to llvm.amdgcn.kill; the expected output
  ; evaluates its inverse with v_cmp_le_f32 and and-nots the result out of exec.
3271  %4 = fcmp nsz arcp ugt float %3, 0.000000e+00
3272  call void @llvm.amdgcn.kill(i1 %4) #1
3273  ret void
3274}
3275
; Declarations of the AMDGPU intrinsics called by the tests in this file.
; NOTE(review): several of these are not referenced in the last part of the
; file; presumably they are used by earlier tests - confirm before removing any.

; Export and image store.
3276declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
3277declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
3278
; Buffer stores and loads (struct and raw variants).
3279declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
3280declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
3281declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
3282declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
3283declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
3284declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
3285
; Image load/sample, kill, the wqm / wwm / strict.* markers, set.inactive and
; miscellaneous helpers.
3286declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
3287declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3288declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3289declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
3290declare void @llvm.amdgcn.kill(i1) #1
3291declare float @llvm.amdgcn.wqm.f32(float) #3
3292declare i32 @llvm.amdgcn.wqm.i32(i32) #3
3293declare float @llvm.amdgcn.strict.wwm.f32(float) #3
3294declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
3295declare float @llvm.amdgcn.wwm.f32(float) #3
3296declare i32 @llvm.amdgcn.wwm.i32(i32) #3
3297declare float @llvm.amdgcn.strict.wqm.f32(float) #3
3298declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
3299declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
3300declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
3301declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
3302declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
3303declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
3304declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
3305declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
; ds.swizzle is the only declaration here without an attribute group.
3306declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
3307declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
3308
; Attribute groups referenced by the definitions and declarations in this file.
; NOTE(review): some image.sample call sites in this file carry #0, but no
; `attributes #0` group appears in this list - confirm whether it is defined
; elsewhere in the file or intentionally left empty.
3309attributes #1 = { nounwind }
3310attributes #2 = { nounwind readonly }
3311attributes #3 = { nounwind readnone }
; #4 adds convergent; it is the group used by the set.inactive declaration.
3312attributes #4 = { nounwind readnone convergent }
3313attributes #5 = { "amdgpu-ps-wqm-outputs" }
3314attributes #6 = { nounwind "InitialPSInputAddr"="2" }
3315attributes #7 = { nounwind readnone willreturn }
3316