1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
3; RUN: llc  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
6; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
7
8declare i1 @llvm.amdgcn.wqm.vote(i1)
9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
10declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg)
11
12; Show what the atomic optimization pass will do for raw buffers.
13
; add_i32_constant: pixel-shader test where every lane adds the same constant
; (5) to the raw buffer %inout, and the returned value is stored to %out only
; when both wqm.vote results are true.
;
; What the autogenerated assertions below record, for all five check prefixes:
;   * exactly one buffer_atomic_add is emitted, inside a block guarded by
;     v_cmp_eq_u32 0 on the v_mbcnt result plus s_and_saveexec;
;   * its operand is s_bcnt1 of a saved copy of exec, multiplied by 5;
;   * after the guarded block, v_readfirstlane reads the atomic's result and
;     v_mad_u32_u24 v0, v0, 5, s4 combines it with the v_mbcnt result;
;   * a single s_wqm instruction appears, after the %Flow block.
; NOTE(review): presumably this is the "one atomic per wave" rewrite enabled by
; -amdgpu-atomic-optimizations=true on the RUN lines -- confirm against the pass.
; Regenerate the assertions with utils/update_llc_test_checks.py; do not edit
; them by hand.
14define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %inout) {
15; GFX7-LABEL: add_i32_constant:
16; GFX7:       ; %bb.0: ; %entry
17; GFX7-NEXT:    s_mov_b64 s[10:11], exec
18; GFX7-NEXT:    ; implicit-def: $vgpr0
19; GFX7-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
20; GFX7-NEXT:    s_cbranch_execz BB0_4
21; GFX7-NEXT:  ; %bb.1:
22; GFX7-NEXT:    s_mov_b64 s[12:13], exec
23; GFX7-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
24; GFX7-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s13, v0
25; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
26; GFX7-NEXT:    ; implicit-def: $vgpr1
27; GFX7-NEXT:    s_and_saveexec_b64 s[10:11], vcc
28; GFX7-NEXT:    s_cbranch_execz BB0_3
29; GFX7-NEXT:  ; %bb.2:
30; GFX7-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
31; GFX7-NEXT:    s_mul_i32 s12, s12, 5
32; GFX7-NEXT:    v_mov_b32_e32 v1, s12
33; GFX7-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
34; GFX7-NEXT:  BB0_3:
35; GFX7-NEXT:    s_or_b64 exec, exec, s[10:11]
36; GFX7-NEXT:    s_waitcnt vmcnt(0)
37; GFX7-NEXT:    v_readfirstlane_b32 s4, v1
38; GFX7-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
39; GFX7-NEXT:  BB0_4: ; %Flow
40; GFX7-NEXT:    s_or_b64 exec, exec, s[8:9]
41; GFX7-NEXT:    s_wqm_b64 s[4:5], -1
42; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
43; GFX7-NEXT:    s_cbranch_vccnz BB0_6
44; GFX7-NEXT:  ; %bb.5: ; %if
45; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
46; GFX7-NEXT:  BB0_6: ; %UnifiedReturnBlock
47; GFX7-NEXT:    s_endpgm
48;
49; GFX8-LABEL: add_i32_constant:
50; GFX8:       ; %bb.0: ; %entry
51; GFX8-NEXT:    s_mov_b64 s[10:11], exec
52; GFX8-NEXT:    ; implicit-def: $vgpr0
53; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
54; GFX8-NEXT:    s_cbranch_execz BB0_4
55; GFX8-NEXT:  ; %bb.1:
56; GFX8-NEXT:    s_mov_b64 s[12:13], exec
57; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
58; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
59; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
60; GFX8-NEXT:    ; implicit-def: $vgpr1
61; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
62; GFX8-NEXT:    s_cbranch_execz BB0_3
63; GFX8-NEXT:  ; %bb.2:
64; GFX8-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
65; GFX8-NEXT:    s_mul_i32 s12, s12, 5
66; GFX8-NEXT:    v_mov_b32_e32 v1, s12
67; GFX8-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
68; GFX8-NEXT:  BB0_3:
69; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
70; GFX8-NEXT:    s_waitcnt vmcnt(0)
71; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
72; GFX8-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
73; GFX8-NEXT:  BB0_4: ; %Flow
74; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
75; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
76; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
77; GFX8-NEXT:    s_cbranch_vccnz BB0_6
78; GFX8-NEXT:  ; %bb.5: ; %if
79; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
80; GFX8-NEXT:  BB0_6: ; %UnifiedReturnBlock
81; GFX8-NEXT:    s_endpgm
82;
83; GFX9-LABEL: add_i32_constant:
84; GFX9:       ; %bb.0: ; %entry
85; GFX9-NEXT:    s_mov_b64 s[10:11], exec
86; GFX9-NEXT:    ; implicit-def: $vgpr0
87; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
88; GFX9-NEXT:    s_cbranch_execz BB0_4
89; GFX9-NEXT:  ; %bb.1:
90; GFX9-NEXT:    s_mov_b64 s[12:13], exec
91; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s12, 0
92; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
93; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
94; GFX9-NEXT:    ; implicit-def: $vgpr1
95; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
96; GFX9-NEXT:    s_cbranch_execz BB0_3
97; GFX9-NEXT:  ; %bb.2:
98; GFX9-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
99; GFX9-NEXT:    s_mul_i32 s12, s12, 5
100; GFX9-NEXT:    v_mov_b32_e32 v1, s12
101; GFX9-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
102; GFX9-NEXT:  BB0_3:
103; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
104; GFX9-NEXT:    s_waitcnt vmcnt(0)
105; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
106; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
107; GFX9-NEXT:  BB0_4: ; %Flow
108; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
109; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
110; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
111; GFX9-NEXT:    s_cbranch_vccnz BB0_6
112; GFX9-NEXT:  ; %bb.5: ; %if
113; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
114; GFX9-NEXT:  BB0_6: ; %UnifiedReturnBlock
115; GFX9-NEXT:    s_endpgm
116;
117; GFX1064-LABEL: add_i32_constant:
118; GFX1064:       ; %bb.0: ; %entry
119; GFX1064-NEXT:    s_mov_b64 s[10:11], exec
120; GFX1064-NEXT:    ; implicit-def: $vgpr0
121; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
122; GFX1064-NEXT:    s_cbranch_execz BB0_4
123; GFX1064-NEXT:  ; %bb.1:
124; GFX1064-NEXT:    s_mov_b64 s[12:13], exec
125; GFX1064-NEXT:    ; implicit-def: $vgpr1
126; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s12, 0
127; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s13, v0
128; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
129; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
130; GFX1064-NEXT:    s_cbranch_execz BB0_3
131; GFX1064-NEXT:  ; %bb.2:
132; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
133; GFX1064-NEXT:    s_mul_i32 s12, s12, 5
134; GFX1064-NEXT:    v_mov_b32_e32 v1, s12
135; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
136; GFX1064-NEXT:  BB0_3:
137; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
138; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
139; GFX1064-NEXT:    s_waitcnt vmcnt(0)
140; GFX1064-NEXT:    v_readfirstlane_b32 s4, v1
141; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
142; GFX1064-NEXT:  BB0_4: ; %Flow
143; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
144; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
145; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
146; GFX1064-NEXT:    s_cbranch_vccnz BB0_6
147; GFX1064-NEXT:  ; %bb.5: ; %if
148; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
149; GFX1064-NEXT:  BB0_6: ; %UnifiedReturnBlock
150; GFX1064-NEXT:    s_endpgm
151;
152; GFX1032-LABEL: add_i32_constant:
153; GFX1032:       ; %bb.0: ; %entry
154; GFX1032-NEXT:    s_mov_b32 s9, exec_lo
155; GFX1032-NEXT:    ; implicit-def: $vgpr0
156; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
157; GFX1032-NEXT:    s_cbranch_execz BB0_4
158; GFX1032-NEXT:  ; %bb.1:
159; GFX1032-NEXT:    s_mov_b32 s10, exec_lo
160; GFX1032-NEXT:    ; implicit-def: $vgpr1
161; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s10, 0
162; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
163; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
164; GFX1032-NEXT:    s_cbranch_execz BB0_3
165; GFX1032-NEXT:  ; %bb.2:
166; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
167; GFX1032-NEXT:    s_mul_i32 s10, s10, 5
168; GFX1032-NEXT:    v_mov_b32_e32 v1, s10
169; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
170; GFX1032-NEXT:  BB0_3:
171; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
172; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
173; GFX1032-NEXT:    s_waitcnt vmcnt(0)
174; GFX1032-NEXT:    v_readfirstlane_b32 s4, v1
175; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
176; GFX1032-NEXT:  BB0_4: ; %Flow
177; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
178; GFX1032-NEXT:    s_wqm_b32 s4, -1
179; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
180; GFX1032-NEXT:    s_cbranch_vccnz BB0_6
181; GFX1032-NEXT:  ; %bb.5: ; %if
182; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
183; GFX1032-NEXT:  BB0_6: ; %UnifiedReturnBlock
184; GFX1032-NEXT:    s_endpgm
185entry:
  ; First vote, taken before the atomic is issued.
186  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  ; Atomic add of the constant 5 to the buffer described by %inout; the three
  ; trailing i32 operands are all zero. %old is the value the intrinsic returns.
187  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; Second vote, taken after the atomic; the store in %if is reached only when
  ; both votes are true.
188  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
189  %cond = and i1 %cond1, %cond2
190  br i1 %cond, label %if, label %else
191if:
  ; Reinterpret the i32 atomic result as float (the store intrinsic takes a
  ; float) and write it to the buffer described by %out.
192  %bitcast = bitcast i32 %old to float
193  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
194  ret void
195else:
196  ret void
197}
198
; add_i32_varying: same shape as the constant-operand test, but the value added
; to the raw buffer %inout is the per-lane argument %val (not marked inreg).
; The returned value is stored to %out only when both wqm.vote results are true.
;
; What the autogenerated assertions below record:
;   * for the GFX7 prefix, buffer_atomic_add is issued directly on v0 with no
;     exec guard and no cross-lane instructions;
;   * for the GFX8 and GFX9 prefixes, a chain of v_add_u32_dpp (row_shr 1/2/4/8,
;     then row_bcast 15 and 31) precedes a single guarded buffer_atomic_add
;     whose operand is read from lane 63, and the per-lane result is rebuilt
;     with v_readfirstlane plus an add;
;   * for the GFX1064 and GFX1032 prefixes, the chain uses v_add_nc_u32_dpp,
;     v_permlanex16_b32 and v_readlane/v_writelane instead of row_bcast.
; NOTE(review): presumably the GFX7 target is left unoptimized because it lacks
; DPP -- confirm against the atomic optimizer pass.
; Regenerate the assertions with utils/update_llc_test_checks.py; do not edit
; them by hand.
199define amdgpu_ps void @add_i32_varying(<4 x i32> inreg %out, <4 x i32> inreg %inout, i32 %val) {
200; GFX7-LABEL: add_i32_varying:
201; GFX7:       ; %bb.0: ; %entry
202; GFX7-NEXT:    s_wqm_b64 s[8:9], -1
203; GFX7-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
204; GFX7-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
205; GFX7-NEXT:    s_cbranch_vccnz BB1_2
206; GFX7-NEXT:  ; %bb.1: ; %if
207; GFX7-NEXT:    s_waitcnt vmcnt(0)
208; GFX7-NEXT:    buffer_store_dword v0, off, s[0:3], 0
209; GFX7-NEXT:  BB1_2: ; %else
210; GFX7-NEXT:    s_endpgm
211;
212; GFX8-LABEL: add_i32_varying:
213; GFX8:       ; %bb.0: ; %entry
214; GFX8-NEXT:    s_mov_b64 s[8:9], exec
215; GFX8-NEXT:    s_mov_b64 s[10:11], s[8:9]
216; GFX8-NEXT:    v_mov_b32_e32 v2, v0
217; GFX8-NEXT:    ; implicit-def: $vgpr0
218; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
219; GFX8-NEXT:    s_cbranch_execz BB1_4
220; GFX8-NEXT:  ; %bb.1:
221; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
222; GFX8-NEXT:    v_mov_b32_e32 v1, 0
223; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
224; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
225; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
226; GFX8-NEXT:    s_not_b64 exec, exec
227; GFX8-NEXT:    v_mov_b32_e32 v2, 0
228; GFX8-NEXT:    s_not_b64 exec, exec
229; GFX8-NEXT:    s_or_saveexec_b64 s[10:11], -1
230; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
231; GFX8-NEXT:    s_nop 1
232; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
233; GFX8-NEXT:    s_nop 1
234; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
235; GFX8-NEXT:    s_nop 1
236; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
237; GFX8-NEXT:    s_nop 1
238; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
239; GFX8-NEXT:    s_nop 1
240; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
241; GFX8-NEXT:    v_readlane_b32 s12, v2, 63
242; GFX8-NEXT:    s_nop 0
243; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
244; GFX8-NEXT:    s_mov_b64 exec, s[10:11]
245; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
246; GFX8-NEXT:    ; implicit-def: $vgpr0
247; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
248; GFX8-NEXT:    s_cbranch_execz BB1_3
249; GFX8-NEXT:  ; %bb.2:
250; GFX8-NEXT:    v_mov_b32_e32 v0, s12
251; GFX8-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
252; GFX8-NEXT:  BB1_3:
253; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
254; GFX8-NEXT:    s_waitcnt vmcnt(0)
255; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
256; GFX8-NEXT:    v_mov_b32_e32 v0, v1
257; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
258; GFX8-NEXT:  BB1_4: ; %Flow
259; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
260; GFX8-NEXT:    s_wqm_b64 s[4:5], -1
261; GFX8-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
262; GFX8-NEXT:    s_cbranch_vccnz BB1_6
263; GFX8-NEXT:  ; %bb.5: ; %if
264; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
265; GFX8-NEXT:  BB1_6: ; %UnifiedReturnBlock
266; GFX8-NEXT:    s_endpgm
267;
268; GFX9-LABEL: add_i32_varying:
269; GFX9:       ; %bb.0: ; %entry
270; GFX9-NEXT:    s_mov_b64 s[8:9], exec
271; GFX9-NEXT:    s_mov_b64 s[10:11], s[8:9]
272; GFX9-NEXT:    v_mov_b32_e32 v2, v0
273; GFX9-NEXT:    ; implicit-def: $vgpr0
274; GFX9-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
275; GFX9-NEXT:    s_cbranch_execz BB1_4
276; GFX9-NEXT:  ; %bb.1:
277; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
278; GFX9-NEXT:    v_mov_b32_e32 v1, 0
279; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
280; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
281; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
282; GFX9-NEXT:    s_not_b64 exec, exec
283; GFX9-NEXT:    v_mov_b32_e32 v2, 0
284; GFX9-NEXT:    s_not_b64 exec, exec
285; GFX9-NEXT:    s_or_saveexec_b64 s[10:11], -1
286; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
287; GFX9-NEXT:    s_nop 1
288; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
289; GFX9-NEXT:    s_nop 1
290; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
291; GFX9-NEXT:    s_nop 1
292; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
293; GFX9-NEXT:    s_nop 1
294; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
295; GFX9-NEXT:    s_nop 1
296; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
297; GFX9-NEXT:    v_readlane_b32 s12, v2, 63
298; GFX9-NEXT:    s_nop 0
299; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
300; GFX9-NEXT:    s_mov_b64 exec, s[10:11]
301; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
302; GFX9-NEXT:    ; implicit-def: $vgpr0
303; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
304; GFX9-NEXT:    s_cbranch_execz BB1_3
305; GFX9-NEXT:  ; %bb.2:
306; GFX9-NEXT:    v_mov_b32_e32 v0, s12
307; GFX9-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
308; GFX9-NEXT:  BB1_3:
309; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
310; GFX9-NEXT:    s_waitcnt vmcnt(0)
311; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
312; GFX9-NEXT:    v_mov_b32_e32 v0, v1
313; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
314; GFX9-NEXT:  BB1_4: ; %Flow
315; GFX9-NEXT:    s_or_b64 exec, exec, s[8:9]
316; GFX9-NEXT:    s_wqm_b64 s[4:5], -1
317; GFX9-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
318; GFX9-NEXT:    s_cbranch_vccnz BB1_6
319; GFX9-NEXT:  ; %bb.5: ; %if
320; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
321; GFX9-NEXT:  BB1_6: ; %UnifiedReturnBlock
322; GFX9-NEXT:    s_endpgm
323;
324; GFX1064-LABEL: add_i32_varying:
325; GFX1064:       ; %bb.0: ; %entry
326; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
327; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
328; GFX1064-NEXT:    s_mov_b64 s[10:11], s[8:9]
329; GFX1064-NEXT:    ; implicit-def: $vgpr0
330; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], s[10:11]
331; GFX1064-NEXT:    s_cbranch_execz BB1_4
332; GFX1064-NEXT:  ; %bb.1:
333; GFX1064-NEXT:    s_not_b64 exec, exec
334; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
335; GFX1064-NEXT:    s_not_b64 exec, exec
336; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
337; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
338; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
339; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
340; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
341; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
342; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
343; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
344; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
345; GFX1064-NEXT:    v_readlane_b32 s12, v1, 31
346; GFX1064-NEXT:    v_mov_b32_e32 v2, s12
347; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
348; GFX1064-NEXT:    v_readlane_b32 s12, v1, 15
349; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
350; GFX1064-NEXT:    v_readlane_b32 s13, v1, 31
351; GFX1064-NEXT:    v_writelane_b32 v3, s12, 16
352; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
353; GFX1064-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
354; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
355; GFX1064-NEXT:    v_readlane_b32 s12, v1, 63
356; GFX1064-NEXT:    v_readlane_b32 s14, v1, 47
357; GFX1064-NEXT:    v_writelane_b32 v3, s13, 32
358; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
359; GFX1064-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
360; GFX1064-NEXT:    s_or_saveexec_b64 s[10:11], -1
361; GFX1064-NEXT:    v_writelane_b32 v3, s14, 48
362; GFX1064-NEXT:    s_mov_b64 exec, s[10:11]
363; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
364; GFX1064-NEXT:    ; implicit-def: $vgpr0
365; GFX1064-NEXT:    s_and_saveexec_b64 s[28:29], vcc
366; GFX1064-NEXT:    s_cbranch_execz BB1_3
367; GFX1064-NEXT:  ; %bb.2:
368; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
369; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
370; GFX1064-NEXT:  BB1_3:
371; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
372; GFX1064-NEXT:    s_or_b64 exec, exec, s[28:29]
373; GFX1064-NEXT:    s_waitcnt vmcnt(0)
374; GFX1064-NEXT:    v_readfirstlane_b32 s4, v0
375; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
376; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s4, v0
377; GFX1064-NEXT:  BB1_4: ; %Flow
378; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
379; GFX1064-NEXT:    s_wqm_b64 s[4:5], -1
380; GFX1064-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
381; GFX1064-NEXT:    s_cbranch_vccnz BB1_6
382; GFX1064-NEXT:  ; %bb.5: ; %if
383; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
384; GFX1064-NEXT:  BB1_6: ; %UnifiedReturnBlock
385; GFX1064-NEXT:    s_endpgm
386;
387; GFX1032-LABEL: add_i32_varying:
388; GFX1032:       ; %bb.0: ; %entry
389; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
390; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
391; GFX1032-NEXT:    s_mov_b32 s9, s8
392; GFX1032-NEXT:    ; implicit-def: $vgpr0
393; GFX1032-NEXT:    s_and_saveexec_b32 s8, s9
394; GFX1032-NEXT:    s_cbranch_execz BB1_4
395; GFX1032-NEXT:  ; %bb.1:
396; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
397; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
398; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
399; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
400; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
401; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
402; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
403; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
404; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
405; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
406; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
407; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
408; GFX1032-NEXT:    v_readlane_b32 s11, v1, 31
409; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
410; GFX1032-NEXT:    v_readlane_b32 s10, v1, 15
411; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
412; GFX1032-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
413; GFX1032-NEXT:    s_or_saveexec_b32 s9, -1
414; GFX1032-NEXT:    v_writelane_b32 v3, s10, 16
415; GFX1032-NEXT:    s_mov_b32 exec_lo, s9
416; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
417; GFX1032-NEXT:    ; implicit-def: $vgpr0
418; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
419; GFX1032-NEXT:    s_cbranch_execz BB1_3
420; GFX1032-NEXT:  ; %bb.2:
421; GFX1032-NEXT:    v_mov_b32_e32 v0, s11
422; GFX1032-NEXT:    s_mov_b32 s10, s11
423; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
424; GFX1032-NEXT:  BB1_3:
425; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
426; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
427; GFX1032-NEXT:    s_waitcnt vmcnt(0)
428; GFX1032-NEXT:    v_readfirstlane_b32 s4, v0
429; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
430; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s4, v0
431; GFX1032-NEXT:  BB1_4: ; %Flow
432; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
433; GFX1032-NEXT:    s_wqm_b32 s4, -1
434; GFX1032-NEXT:    s_andn2_b32 vcc_lo, exec_lo, s4
435; GFX1032-NEXT:    s_cbranch_vccnz BB1_6
436; GFX1032-NEXT:  ; %bb.5: ; %if
437; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
438; GFX1032-NEXT:  BB1_6: ; %UnifiedReturnBlock
439; GFX1032-NEXT:    s_endpgm
440entry:
  ; First vote, taken before the atomic is issued.
441  %cond1 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
  ; Atomic add of the per-lane value %val to the buffer described by %inout;
  ; the three trailing i32 operands are all zero. %old is the value the
  ; intrinsic returns.
442  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %val, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; Second vote, taken after the atomic; the store in %if is reached only when
  ; both votes are true.
443  %cond2 = call i1 @llvm.amdgcn.wqm.vote(i1 true)
444  %cond = and i1 %cond1, %cond2
445  br i1 %cond, label %if, label %else
446if:
  ; Reinterpret the i32 atomic result as float (the store intrinsic takes a
  ; float) and write it to the buffer described by %out.
447  %bitcast = bitcast i32 %old to float
448  call void @llvm.amdgcn.raw.buffer.store.f32(float %bitcast, <4 x i32> %out, i32 0, i32 0, i32 0)
449  ret void
450else:
451  ret void
452}
453