1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
3; RUN: llc  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
8; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
9
10declare i1 @llvm.amdgcn.wqm.vote(i1)
11declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
12declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)
13
14; Show what the atomic optimization pass will do for raw buffers.
15
; add_i32_constant: every lane adds the constant 5 to the buffer %inout with
; llvm.amdgcn.raw.buffer.atomic.add. The atomic sits between two
; llvm.amdgcn.wqm.vote(true) calls; the AND of both votes guards a store of
; the returned old value (bitcast to float) into the buffer %out.
;
; In the checked output of every run, only the lane whose mbcnt result is 0
; issues a single buffer atomic with the value bcnt(exec copy) * 5. Each lane
; then rebuilds its own result with v_mad_u32_u24 as
; mbcnt * 5 + readfirstlane(value returned by the atomic).
16define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
17; GFX7-LABEL: add_i32_constant:
18; GFX7:       ; %bb.0: ; %entry
19; GFX7-NEXT:    s_mov_b64 s[10:11], exec
20; GFX7-NEXT:    ; implicit-def: $vgpr0
21; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
22; GFX7-NEXT:    s_cbranch_execz .LBB0_4
23; GFX7-NEXT:  ; %bb.1:
24; GFX7-NEXT:    s_mov_b64 s[12:13], exec
25; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
26; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s13, v0
27; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
28; GFX7-NEXT:    ; implicit-def: $vgpr1
29; GFX7-NEXT:    s_and_saveexec_b64 s[10:11], vcc
30; GFX7-NEXT:    s_cbranch_execz .LBB0_3
31; GFX7-NEXT:  ; %bb.2:
32; GFX7-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
33; GFX7-NEXT:    s_mul_i32 s12, s12, 5
34; GFX7-NEXT:    v_mov_b32_e32 v1, s12
35; GFX7-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
36; GFX7-NEXT:  .LBB0_3:
37; GFX7-NEXT:    s_or_b64 exec, exec, s[10:11]
38; GFX7-NEXT:    s_waitcnt vmcnt(0)
39; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
40; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
41; GFX7-NEXT:  .LBB0_4: ; %Flow
42; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
43; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
44; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
45; GFX7-NEXT:    s_cbranch_vccnz .LBB0_6
46; GFX7-NEXT:  ; %bb.5: ; %if
47; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
48; GFX7-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
49; GFX7-NEXT:    s_endpgm
50;
51; GFX89-LABEL: add_i32_constant:
52; GFX89:       ; %bb.0: ; %entry
53; GFX89-NEXT:    s_mov_b64 s[10:11], exec
54; GFX89-NEXT:    ; implicit-def: $vgpr0
55; GFX89-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
56; GFX89-NEXT:    s_cbranch_execz .LBB0_4
57; GFX89-NEXT:  ; %bb.1:
58; GFX89-NEXT:    s_mov_b64 s[12:13], exec
59; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
60; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
61; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
62; GFX89-NEXT:    ; implicit-def: $vgpr1
63; GFX89-NEXT:    s_and_saveexec_b64 s[10:11], vcc
64; GFX89-NEXT:    s_cbranch_execz .LBB0_3
65; GFX89-NEXT:  ; %bb.2:
66; GFX89-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
67; GFX89-NEXT:    s_mul_i32 s12, s12, 5
68; GFX89-NEXT:    v_mov_b32_e32 v1, s12
69; GFX89-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
70; GFX89-NEXT:  .LBB0_3:
71; GFX89-NEXT:    s_or_b64 exec, exec, s[10:11]
72; GFX89-NEXT:    s_waitcnt vmcnt(0)
73; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
74; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
75; GFX89-NEXT:  .LBB0_4: ; %Flow
76; GFX89-NEXT:    s_or_b64 exec, exec, s[8:9]
77; GFX89-NEXT:    s_wqm_b64 s[4:5], -1
78; GFX89-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
79; GFX89-NEXT:    s_cbranch_vccnz .LBB0_6
80; GFX89-NEXT:  ; %bb.5: ; %if
81; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
82; GFX89-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
83; GFX89-NEXT:    s_endpgm
84;
85; GFX1064-LABEL: add_i32_constant:
86; GFX1064:       ; %bb.0: ; %entry
87; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
88; GFX1064-NEXT:    ; implicit-def: $vgpr0
89; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
90; GFX1064-NEXT:    s_cbranch_execz .LBB0_4
91; GFX1064-NEXT:  ; %bb.1:
92; GFX1064-NEXT:    s_mov_b64 s[12:13], exec
93; GFX1064-NEXT:    ; implicit-def: $vgpr1
94; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
95; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
96; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
97; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
98; GFX1064-NEXT:    s_cbranch_execz .LBB0_3
99; GFX1064-NEXT:  ; %bb.2:
100; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
101; GFX1064-NEXT:    s_mul_i32 s12, s12, 5
102; GFX1064-NEXT:    v_mov_b32_e32 v1, s12
103; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
104; GFX1064-NEXT:  .LBB0_3:
105; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
106; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
107; GFX1064-NEXT:    s_waitcnt vmcnt(0)
108; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
109; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
110; GFX1064-NEXT:  .LBB0_4: ; %Flow
111; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
112; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
113; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
114; GFX1064-NEXT:    s_cbranch_vccnz .LBB0_6
115; GFX1064-NEXT:  ; %bb.5: ; %if
116; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
117; GFX1064-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
118; GFX1064-NEXT:    s_endpgm
119;
120; GFX1032-LABEL: add_i32_constant:
121; GFX1032:       ; %bb.0: ; %entry
122; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
123; GFX1032-NEXT:    ; implicit-def: $vgpr0
124; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
125; GFX1032-NEXT:    s_cbranch_execz .LBB0_4
126; GFX1032-NEXT:  ; %bb.1:
127; GFX1032-NEXT:    s_mov_b32 s10, exec_lo
128; GFX1032-NEXT:    ; implicit-def: $vgpr1
129; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
130; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
131; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
132; GFX1032-NEXT:    s_cbranch_execz .LBB0_3
133; GFX1032-NEXT:  ; %bb.2:
134; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
135; GFX1032-NEXT:    s_mul_i32 s10, s10, 5
136; GFX1032-NEXT:    v_mov_b32_e32 v1, s10
137; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
138; GFX1032-NEXT:  .LBB0_3:
139; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
140; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
141; GFX1032-NEXT:    s_waitcnt vmcnt(0)
142; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
143; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
144; GFX1032-NEXT:  .LBB0_4: ; %Flow
145; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
146; GFX1032-NEXT:    s_wqm_b32 s4, -1
147; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
148; GFX1032-NEXT:    s_cbranch_vccnz .LBB0_6
149; GFX1032-NEXT:  ; %bb.5: ; %if
150; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
151; GFX1032-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
152; GFX1032-NEXT:    s_endpgm
153;
154; GFX1164-LABEL: add_i32_constant:
155; GFX1164:       ; %bb.0: ; %entry
156; GFX1164-NEXT:    s_mov_b64 s[10:11], exec
157; GFX1164-NEXT:    ; implicit-def: $vgpr0
158; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
159; GFX1164-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
160; GFX1164-NEXT:    s_cbranch_execz .LBB0_4
161; GFX1164-NEXT:  ; %bb.1:
162; GFX1164-NEXT:    s_mov_b64 s[12:13], exec
163; GFX1164-NEXT:    s_mov_b64 s[10:11], exec
164; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
165; GFX1164-NEXT:    ; implicit-def: $vgpr1
166; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
167; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
168; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
169; GFX1164-NEXT:    s_cbranch_execz .LBB0_3
170; GFX1164-NEXT:  ; %bb.2:
171; GFX1164-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
172; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
173; GFX1164-NEXT:    s_mul_i32 s12, s12, 5
174; GFX1164-NEXT:    v_mov_b32_e32 v1, s12
175; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
176; GFX1164-NEXT:  .LBB0_3:
177; GFX1164-NEXT:    s_or_b64 exec, exec, s[10:11]
178; GFX1164-NEXT:    s_waitcnt vmcnt(0)
179; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
180; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
181; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
182; GFX1164-NEXT:  .LBB0_4: ; %Flow
183; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
184; GFX1164-NEXT:    s_wqm_b64 s[4:5], -1
185; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
186; GFX1164-NEXT:    s_and_not1_b64 vcc, exec, s[4:5]
187; GFX1164-NEXT:    s_cbranch_vccnz .LBB0_6
188; GFX1164-NEXT:  ; %bb.5: ; %if
189; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
190; GFX1164-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
191; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
192; GFX1164-NEXT:    s_endpgm
193;
194; GFX1132-LABEL: add_i32_constant:
195; GFX1132:       ; %bb.0: ; %entry
196; GFX1132-NEXT:    s_mov_b32 s9, exec_lo
197; GFX1132-NEXT:    ; implicit-def: $vgpr0
198; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
199; GFX1132-NEXT:    s_and_saveexec_b32 s8, s9
200; GFX1132-NEXT:    s_cbranch_execz .LBB0_4
201; GFX1132-NEXT:  ; %bb.1:
202; GFX1132-NEXT:    s_mov_b32 s10, exec_lo
203; GFX1132-NEXT:    s_mov_b32 s9, exec_lo
204; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
205; GFX1132-NEXT:    ; implicit-def: $vgpr1
206; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
207; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
208; GFX1132-NEXT:    s_cbranch_execz .LBB0_3
209; GFX1132-NEXT:  ; %bb.2:
210; GFX1132-NEXT:    s_bcnt1_i32_b32 s10, s10
211; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
212; GFX1132-NEXT:    s_mul_i32 s10, s10, 5
213; GFX1132-NEXT:    v_mov_b32_e32 v1, s10
214; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
215; GFX1132-NEXT:  .LBB0_3:
216; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
217; GFX1132-NEXT:    s_waitcnt vmcnt(0)
218; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
219; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
220; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
221; GFX1132-NEXT:  .LBB0_4: ; %Flow
222; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
223; GFX1132-NEXT:    s_wqm_b32 s4, -1
224; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
225; GFX1132-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
226; GFX1132-NEXT:    s_cbranch_vccnz .LBB0_6
227; GFX1132-NEXT:  ; %bb.5: ; %if
228; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
229; GFX1132-NEXT:  .LBB0_6: ; %UnifiedReturnBlock
230; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
231; GFX1132-NEXT:    s_endpgm
232entry:
  ; First vote, taken before the atomic.
233  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  ; The addend is the constant i32 5, the same for every lane; the remaining
  ; i32 operands are all 0.
234  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; Second vote, taken after the atomic.
235  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  ; The store below runs only when both votes are true.
236  %cond = and i1 %cond1, %cond2
237  br i1 %cond, label %if, label %else
238if:
  ; Reinterpret the i32 result of the atomic as float for the f32 buffer
  ; store; the store keeps %old live, so the atomic's return value is used.
239  %bitcast = bitcast i32 %old to float
240  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
241  ret void
242else:
243  ret void
244}
245
; add_i32_varying: every lane adds its own value %val (a non-inreg argument)
; to the buffer %inout with llvm.amdgcn.raw.buffer.atomic.add. The atomic sits
; between two llvm.amdgcn.wqm.vote(true) calls; the AND of both votes guards a
; store of the returned old value (bitcast to float) into the buffer %out.
;
; Checked output:
;  * The run without -mcpu keeps a plain buffer_atomic_add on v0 with no
;    cross-lane reduction (presumably because that target lacks DPP; confirm).
;  * The tonga and gfx900 runs combine %val across lanes with a chain of
;    v_add_u32_dpp steps (row_shr 1/2/4/8, then row_bcast 15 and 31).
;  * The gfx1010 and gfx1100 runs use v_add_nc_u32_dpp row_shr steps followed
;    by v_permlanex16_b32 and v_readlane/v_writelane fix-ups.
;  In the reduced variants, only the lane whose mbcnt result is 0 issues the
;  atomic, using the value read from the last lane of the scan (lane 63 or
;  lane 31). Each lane then adds its own shifted scan value to the
;  readfirstlane of the value returned by the atomic.
246define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
247; GFX7-LABEL: add_i32_varying:
248; GFX7:       ; %bb.0: ; %entry
249; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
250; GFX7-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
251; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
252; GFX7-NEXT:    s_cbranch_vccnz .LBB1_2
253; GFX7-NEXT:  ; %bb.1: ; %if
254; GFX7-NEXT:    s_waitcnt vmcnt(0)
255; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
256; GFX7-NEXT:  .LBB1_2: ; %else
257; GFX7-NEXT:    s_endpgm
258;
259; GFX8-LABEL: add_i32_varying:
260; GFX8:       ; %bb.0: ; %entry
261; GFX8-NEXT:    s_mov_b64 s[8:9], exec
262; GFX8-NEXT:    s_mov_b64 s[10:11], s[8:9]
263; GFX8-NEXT:    ; implicit-def: $vgpr3
264; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
265; GFX8-NEXT:    s_cbranch_execz .LBB1_4
266; GFX8-NEXT:  ; %bb.1:
267; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
268; GFX8-NEXT:    v_mov_b32_e32 v1, 0
269; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
270; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
271; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
272; GFX8-NEXT:    v_mov_b32_e32 v2, v0
273; GFX8-NEXT:    s_not_b64 exec, exec
274; GFX8-NEXT:    v_mov_b32_e32 v2, 0
275; GFX8-NEXT:    s_not_b64 exec, exec
276; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
277; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
278; GFX8-NEXT:    s_nop 1
279; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
280; GFX8-NEXT:    s_nop 1
281; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
282; GFX8-NEXT:    s_nop 1
283; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
284; GFX8-NEXT:    s_nop 1
285; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
286; GFX8-NEXT:    s_nop 1
287; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
288; GFX8-NEXT:    v_readlane_b32 s12, v2, 63
289; GFX8-NEXT:    s_nop 0
290; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
291; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
292; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
293; GFX8-NEXT:    ; implicit-def: $vgpr0
294; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
295; GFX8-NEXT:    s_cbranch_execz .LBB1_3
296; GFX8-NEXT:  ; %bb.2:
297; GFX8-NEXT:    v_mov_b32_e32 v0, s12
298; GFX8-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
299; GFX8-NEXT:  .LBB1_3:
300; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
301; GFX8-NEXT:    s_waitcnt vmcnt(0)
302; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
303; GFX8-NEXT:    v_mov_b32_e32 v0, v1
304; GFX8-NEXT:    v_add_u32_e32 v3, vcc, s4, v0
305; GFX8-NEXT:  .LBB1_4: ; %Flow
306; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
307; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
308; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
309; GFX8-NEXT:    s_cbranch_vccnz .LBB1_6
310; GFX8-NEXT:  ; %bb.5: ; %if
311; GFX8-NEXT:    buffer_store_dword v3, off, s[0:3], 0
312; GFX8-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
313; GFX8-NEXT:    s_endpgm
314;
315; GFX9-LABEL: add_i32_varying:
316; GFX9:       ; %bb.0: ; %entry
317; GFX9-NEXT:    s_mov_b64 s[8:9], exec
318; GFX9-NEXT:    s_mov_b64 s[10:11], s[8:9]
319; GFX9-NEXT:    ; implicit-def: $vgpr3
320; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
321; GFX9-NEXT:    s_cbranch_execz .LBB1_4
322; GFX9-NEXT:  ; %bb.1:
323; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
324; GFX9-NEXT:    v_mov_b32_e32 v1, 0
325; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
326; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
327; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
328; GFX9-NEXT:    v_mov_b32_e32 v2, v0
329; GFX9-NEXT:    s_not_b64 exec, exec
330; GFX9-NEXT:    v_mov_b32_e32 v2, 0
331; GFX9-NEXT:    s_not_b64 exec, exec
332; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
333; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
334; GFX9-NEXT:    s_nop 1
335; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
336; GFX9-NEXT:    s_nop 1
337; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
338; GFX9-NEXT:    s_nop 1
339; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
340; GFX9-NEXT:    s_nop 1
341; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
342; GFX9-NEXT:    s_nop 1
343; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
344; GFX9-NEXT:    v_readlane_b32 s12, v2, 63
345; GFX9-NEXT:    s_nop 0
346; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
347; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
348; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
349; GFX9-NEXT:    ; implicit-def: $vgpr0
350; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
351; GFX9-NEXT:    s_cbranch_execz .LBB1_3
352; GFX9-NEXT:  ; %bb.2:
353; GFX9-NEXT:    v_mov_b32_e32 v0, s12
354; GFX9-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
355; GFX9-NEXT:  .LBB1_3:
356; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
357; GFX9-NEXT:    s_waitcnt vmcnt(0)
358; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
359; GFX9-NEXT:    v_mov_b32_e32 v0, v1
360; GFX9-NEXT:    v_add_u32_e32 v3, s4, v0
361; GFX9-NEXT:  .LBB1_4: ; %Flow
362; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
363; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
364; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
365; GFX9-NEXT:    s_cbranch_vccnz .LBB1_6
366; GFX9-NEXT:  ; %bb.5: ; %if
367; GFX9-NEXT:    buffer_store_dword v3, off, s[0:3], 0
368; GFX9-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
369; GFX9-NEXT:    s_endpgm
370;
371; GFX1064-LABEL: add_i32_varying:
372; GFX1064:       ; %bb.0: ; %entry
373; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
374; GFX1064-NEXT:    ; implicit-def: $vgpr4
375; GFX1064-NEXT:    s_mov_b64 s[10:11], s[8:9]
376; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
377; GFX1064-NEXT:    s_cbranch_execz .LBB1_4
378; GFX1064-NEXT:  ; %bb.1:
379; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
380; GFX1064-NEXT:    s_not_b64 exec, exec
381; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
382; GFX1064-NEXT:    s_not_b64 exec, exec
383; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
384; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
385; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
386; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
387; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
388; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
389; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
390; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
391; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
392; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
393; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
394; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
395; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
396; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
397; GFX1064-NEXT:    v_readlane_b32 s13, v1, 31
398; GFX1064-NEXT:    v_writelane_b32 v3, s12, 16
399; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
400; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
401; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
402; GFX1064-NEXT:    v_readlane_b32 s12, v1, 63
403; GFX1064-NEXT:    v_readlane_b32 s14, v1, 47
404; GFX1064-NEXT:    v_writelane_b32 v3, s13, 32
405; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
406; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
407; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
408; GFX1064-NEXT:    v_writelane_b32 v3, s14, 48
409; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
410; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
411; GFX1064-NEXT:    ; implicit-def: $vgpr0
412; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
413; GFX1064-NEXT:    s_cbranch_execz .LBB1_3
414; GFX1064-NEXT:  ; %bb.2:
415; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
416; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
417; GFX1064-NEXT:  .LBB1_3:
418; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
419; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
420; GFX1064-NEXT:    s_waitcnt vmcnt(0)
421; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
422; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
423; GFX1064-NEXT:    v_add_nc_u32_e32 v4, s4, v0
424; GFX1064-NEXT:  .LBB1_4: ; %Flow
425; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
426; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
427; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
428; GFX1064-NEXT:    s_cbranch_vccnz .LBB1_6
429; GFX1064-NEXT:  ; %bb.5: ; %if
430; GFX1064-NEXT:    buffer_store_dword v4, off, s[0:3], 0
431; GFX1064-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
432; GFX1064-NEXT:    s_endpgm
433;
434; GFX1032-LABEL: add_i32_varying:
435; GFX1032:       ; %bb.0: ; %entry
436; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
437; GFX1032-NEXT:    ; implicit-def: $vgpr4
438; GFX1032-NEXT:    s_mov_b32 s9, s8
439; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
440; GFX1032-NEXT:    s_cbranch_execz .LBB1_4
441; GFX1032-NEXT:  ; %bb.1:
442; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
443; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
444; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
445; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
446; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
447; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
448; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
449; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
450; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
451; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
452; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
453; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
454; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
455; GFX1032-NEXT:    v_readlane_b32 s11, v1, 31
456; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
457; GFX1032-NEXT:    v_readlane_b32 s10, v1, 15
458; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
459; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
460; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
461; GFX1032-NEXT:    v_writelane_b32 v3, s10, 16
462; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
463; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
464; GFX1032-NEXT:    ; implicit-def: $vgpr0
465; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
466; GFX1032-NEXT:    s_cbranch_execz .LBB1_3
467; GFX1032-NEXT:  ; %bb.2:
468; GFX1032-NEXT:    v_mov_b32_e32 v0, s11
469; GFX1032-NEXT:    s_mov_b32 s10, s11
470; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
471; GFX1032-NEXT:  .LBB1_3:
472; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
473; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
474; GFX1032-NEXT:    s_waitcnt vmcnt(0)
475; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
476; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
477; GFX1032-NEXT:    v_add_nc_u32_e32 v4, s4, v0
478; GFX1032-NEXT:  .LBB1_4: ; %Flow
479; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
480; GFX1032-NEXT:    s_wqm_b32 s4, -1
481; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
482; GFX1032-NEXT:    s_cbranch_vccnz .LBB1_6
483; GFX1032-NEXT:  ; %bb.5: ; %if
484; GFX1032-NEXT:    buffer_store_dword v4, off, s[0:3], 0
485; GFX1032-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
486; GFX1032-NEXT:    s_endpgm
487;
488; GFX1164-LABEL: add_i32_varying:
489; GFX1164:       ; %bb.0: ; %entry
490; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
491; GFX1164-NEXT:    ; implicit-def: $vgpr4
492; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
493; GFX1164-NEXT:    s_mov_b64 s[10:11], s[8:9]
494; GFX1164-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
495; GFX1164-NEXT:    s_cbranch_execz .LBB1_4
496; GFX1164-NEXT:  ; %bb.1:
497; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
498; GFX1164-NEXT:    s_not_b64 exec, exec
499; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
500; GFX1164-NEXT:    s_not_b64 exec, exec
501; GFX1164-NEXT:    s_or_saveexec_b64 s[10:11], -1
502; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
503; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
504; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
505; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
506; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
507; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
508; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
509; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
510; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
511; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
512; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
513; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
514; GFX1164-NEXT:    v_readlane_b32 s12, v1, 31
515; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
516; GFX1164-NEXT:    v_mov_b32_e32 v2, s12
517; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
518; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
519; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
520; GFX1164-NEXT:    v_readlane_b32 s12, v1, 15
521; GFX1164-NEXT:    v_readlane_b32 s13, v1, 31
522; GFX1164-NEXT:    v_writelane_b32 v3, s12, 16
523; GFX1164-NEXT:    s_mov_b64 exec, s[10:11]
524; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
525; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
526; GFX1164-NEXT:    s_or_saveexec_b64 s[10:11], -1
527; GFX1164-NEXT:    v_readlane_b32 s12, v1, 63
528; GFX1164-NEXT:    v_readlane_b32 s14, v1, 47
529; GFX1164-NEXT:    v_writelane_b32 v3, s13, 32
530; GFX1164-NEXT:    s_mov_b64 exec, s[10:11]
531; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
532; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
533; GFX1164-NEXT:    s_or_saveexec_b64 s[10:11], -1
534; GFX1164-NEXT:    v_writelane_b32 v3, s14, 48
535; GFX1164-NEXT:    s_mov_b64 exec, s[10:11]
536; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
537; GFX1164-NEXT:    ; implicit-def: $vgpr0
538; GFX1164-NEXT:    s_and_saveexec_b64 s[10:11], vcc
539; GFX1164-NEXT:    s_cbranch_execz .LBB1_3
540; GFX1164-NEXT:  ; %bb.2:
541; GFX1164-NEXT:    v_mov_b32_e32 v0, s12
542; GFX1164-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
543; GFX1164-NEXT:  .LBB1_3:
544; GFX1164-NEXT:    s_or_b64 exec, exec, s[10:11]
545; GFX1164-NEXT:    s_waitcnt vmcnt(0)
546; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
547; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
548; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
549; GFX1164-NEXT:    v_add_nc_u32_e32 v4, s4, v0
550; GFX1164-NEXT:  .LBB1_4: ; %Flow
551; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
552; GFX1164-NEXT:    s_wqm_b64 s[4:5], -1
553; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
554; GFX1164-NEXT:    s_and_not1_b64 vcc, exec, s[4:5]
555; GFX1164-NEXT:    s_cbranch_vccnz .LBB1_6
556; GFX1164-NEXT:  ; %bb.5: ; %if
557; GFX1164-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
558; GFX1164-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
559; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
560; GFX1164-NEXT:    s_endpgm
561;
562; GFX1132-LABEL: add_i32_varying:
563; GFX1132:       ; %bb.0: ; %entry
564; GFX1132-NEXT:    s_mov_b32 s8, exec_lo
565; GFX1132-NEXT:    ; implicit-def: $vgpr4
566; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
567; GFX1132-NEXT:    s_mov_b32 s9, s8
568; GFX1132-NEXT:    s_and_saveexec_b32 s8, s9
569; GFX1132-NEXT:    s_cbranch_execz .LBB1_4
570; GFX1132-NEXT:  ; %bb.1:
571; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
572; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
573; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
574; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
575; GFX1132-NEXT:    s_or_saveexec_b32 s9, -1
576; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
577; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
578; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
579; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
580; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
581; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
582; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
583; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
584; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
585; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
586; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
587; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
588; GFX1132-NEXT:    v_readlane_b32 s11, v1, 31
589; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
590; GFX1132-NEXT:    v_readlane_b32 s10, v1, 15
591; GFX1132-NEXT:    s_mov_b32 exec_lo, s9
592; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
593; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
594; GFX1132-NEXT:    s_or_saveexec_b32 s9, -1
595; GFX1132-NEXT:    v_writelane_b32 v3, s10, 16
596; GFX1132-NEXT:    s_mov_b32 exec_lo, s9
597; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
598; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
599; GFX1132-NEXT:    ; implicit-def: $vgpr0
600; GFX1132-NEXT:    s_and_saveexec_b32 s9, vcc_lo
601; GFX1132-NEXT:    s_cbranch_execz .LBB1_3
602; GFX1132-NEXT:  ; %bb.2:
603; GFX1132-NEXT:    v_mov_b32_e32 v0, s11
604; GFX1132-NEXT:    s_mov_b32 s10, s11
605; GFX1132-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
606; GFX1132-NEXT:  .LBB1_3:
607; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
608; GFX1132-NEXT:    s_waitcnt vmcnt(0)
609; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
610; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
611; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
612; GFX1132-NEXT:    v_add_nc_u32_e32 v4, s4, v0
613; GFX1132-NEXT:  .LBB1_4: ; %Flow
614; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
615; GFX1132-NEXT:    s_wqm_b32 s4, -1
616; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
617; GFX1132-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s4
618; GFX1132-NEXT:    s_cbranch_vccnz .LBB1_6
619; GFX1132-NEXT:  ; %bb.5: ; %if
620; GFX1132-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
621; GFX1132-NEXT:  .LBB1_6: ; %UnifiedReturnBlock
622; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
623; GFX1132-NEXT:    s_endpgm
624entry:
  ; First vote, taken before the atomic.
625  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  ; The addend is the per-lane argument %val; the remaining i32 operands are
  ; all 0.
626  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; Second vote, taken after the atomic.
627  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  ; The store below runs only when both votes are true.
628  %cond = and i1 %cond1, %cond2
629  br i1 %cond, label %if, label %else
630if:
  ; Reinterpret the i32 result of the atomic as float for the f32 buffer
  ; store; the store keeps %old live, so the atomic's return value is used.
631  %bitcast = bitcast i32 %old to float
632  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
633  ret void
634else:
635  ret void
636}
637