1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
3; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
7
; Intrinsic called by @add_i32_varying below; its result is used there as the
; addend of the atomicrmw.
8declare i32 @llvm.amdgcn.workitem.id.x()
9
10; Show what the atomic optimization pass will do for global pointers.
11
; atomicrmw add of the constant 5 on %inout; the returned old value is stored
; to %out.
; On every run line the checks expect a single buffer_atomic_add, guarded by
; s_and_saveexec on "mbcnt == 0", whose operand is s_bcnt1(saved exec) * 5.
; After the guard each lane rebuilds its own result as
; readfirstlane(old) + mbcnt * 5 with one v_mad_u32_u24.
12define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
13; GFX7LESS-LABEL: add_i32_constant:
14; GFX7LESS:       ; %bb.0: ; %entry
15; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
16; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
17; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
18; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
19; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
20; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
21; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
22; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
23; GFX7LESS-NEXT:  ; %bb.1:
24; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
25; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
26; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
27; GFX7LESS-NEXT:    s_mov_b32 s10, -1
28; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX7LESS-NEXT:    s_mov_b32 s8, s2
30; GFX7LESS-NEXT:    s_mov_b32 s9, s3
31; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
32; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
33; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
34; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
35; GFX7LESS-NEXT:    buffer_wbinvl1
36; GFX7LESS-NEXT:  .LBB0_2:
37; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
38; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
40; GFX7LESS-NEXT:    s_mov_b32 s2, -1
41; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
42; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
43; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
44; GFX7LESS-NEXT:    s_endpgm
45;
46; GFX89-LABEL: add_i32_constant:
47; GFX89:       ; %bb.0: ; %entry
48; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
49; GFX89-NEXT:    s_mov_b64 s[6:7], exec
50; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
51; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
52; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
53; GFX89-NEXT:    ; implicit-def: $vgpr1
54; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
55; GFX89-NEXT:    s_cbranch_execz .LBB0_2
56; GFX89-NEXT:  ; %bb.1:
57; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX89-NEXT:    s_mov_b32 s8, s2
59; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
60; GFX89-NEXT:    s_mul_i32 s2, s2, 5
61; GFX89-NEXT:    s_mov_b32 s11, 0xf000
62; GFX89-NEXT:    s_mov_b32 s10, -1
63; GFX89-NEXT:    s_mov_b32 s9, s3
64; GFX89-NEXT:    v_mov_b32_e32 v1, s2
65; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
66; GFX89-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
67; GFX89-NEXT:    s_waitcnt vmcnt(0)
68; GFX89-NEXT:    buffer_wbinvl1_vol
69; GFX89-NEXT:  .LBB0_2:
70; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
71; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
72; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX89-NEXT:    s_mov_b32 s3, 0xf000
74; GFX89-NEXT:    s_mov_b32 s2, -1
75; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
76; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
77; GFX89-NEXT:    s_endpgm
78;
79; GFX1064-LABEL: add_i32_constant:
80; GFX1064:       ; %bb.0: ; %entry
81; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
82; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
83; GFX1064-NEXT:    ; implicit-def: $vgpr1
84; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
85; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
86; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
87; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
88; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
89; GFX1064-NEXT:  ; %bb.1:
90; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
91; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
92; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
93; GFX1064-NEXT:    s_mov_b32 s10, -1
94; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
95; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX1064-NEXT:    s_mov_b32 s8, s2
97; GFX1064-NEXT:    s_mov_b32 s9, s3
98; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
99; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
100; GFX1064-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
101; GFX1064-NEXT:    s_waitcnt vmcnt(0)
102; GFX1064-NEXT:    buffer_gl0_inv
103; GFX1064-NEXT:    buffer_gl1_inv
104; GFX1064-NEXT:  .LBB0_2:
105; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
106; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
107; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
108; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
109; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
110; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
111; GFX1064-NEXT:    s_mov_b32 s2, -1
112; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
113; GFX1064-NEXT:    s_endpgm
114;
115; GFX1032-LABEL: add_i32_constant:
116; GFX1032:       ; %bb.0: ; %entry
117; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
118; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
119; GFX1032-NEXT:    ; implicit-def: $vgpr1
120; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
121; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
122; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
123; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
124; GFX1032-NEXT:  ; %bb.1:
125; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
126; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
127; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
128; GFX1032-NEXT:    s_mov_b32 s10, -1
129; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
130; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX1032-NEXT:    s_mov_b32 s8, s2
132; GFX1032-NEXT:    s_mov_b32 s9, s3
133; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
134; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
135; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
136; GFX1032-NEXT:    s_waitcnt vmcnt(0)
137; GFX1032-NEXT:    buffer_gl0_inv
138; GFX1032-NEXT:    buffer_gl1_inv
139; GFX1032-NEXT:  .LBB0_2:
140; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
141; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
142; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
143; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
144; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
145; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
146; GFX1032-NEXT:    s_mov_b32 s2, -1
147; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
148; GFX1032-NEXT:    s_endpgm
149entry:
150  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
151  store i32 %old, i32 addrspace(1)* %out
152  ret void
153}
154
; atomicrmw add of the kernel argument %additive on %inout; the returned old
; value is stored to %out. The addend is loaded with s_load_dword, i.e. it
; lives in a scalar register.
; On every run line the checks expect one buffer_atomic_add, guarded by
; s_and_saveexec on "mbcnt == 0", whose operand is
; %additive * s_bcnt1(saved exec) (s_mul_i32). After the guard each lane
; computes readfirstlane(old) + %additive * mbcnt: v_mul_lo_u32 followed by an
; add on the pre-gfx10 runs, a single v_mad_u64_u32 on the gfx1010 runs.
155define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
156; GFX7LESS-LABEL: add_i32_uniform:
157; GFX7LESS:       ; %bb.0: ; %entry
158; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
159; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
160; GFX7LESS-NEXT:    s_load_dword s8, s[0:1], 0xd
161; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
162; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
163; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
164; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
165; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
166; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
167; GFX7LESS-NEXT:  ; %bb.1:
168; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
169; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
170; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
171; GFX7LESS-NEXT:    s_mul_i32 s2, s8, s2
172; GFX7LESS-NEXT:    s_mov_b32 s14, -1
173; GFX7LESS-NEXT:    s_mov_b32 s12, s6
174; GFX7LESS-NEXT:    s_mov_b32 s13, s7
175; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
176; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
177; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
178; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
179; GFX7LESS-NEXT:    buffer_wbinvl1
180; GFX7LESS-NEXT:  .LBB1_2:
181; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
182; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
184; GFX7LESS-NEXT:    s_mov_b32 s6, -1
185; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
186; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s8, v0
187; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
188; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
189; GFX7LESS-NEXT:    s_endpgm
190;
191; GFX8-LABEL: add_i32_uniform:
192; GFX8:       ; %bb.0: ; %entry
193; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
194; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x34
195; GFX8-NEXT:    s_mov_b64 s[2:3], exec
196; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
197; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
198; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
199; GFX8-NEXT:    ; implicit-def: $vgpr1
200; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
201; GFX8-NEXT:    s_cbranch_execz .LBB1_2
202; GFX8-NEXT:  ; %bb.1:
203; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
204; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX8-NEXT:    s_mul_i32 s2, s8, s2
206; GFX8-NEXT:    s_mov_b32 s15, 0xf000
207; GFX8-NEXT:    s_mov_b32 s14, -1
208; GFX8-NEXT:    s_mov_b32 s12, s6
209; GFX8-NEXT:    s_mov_b32 s13, s7
210; GFX8-NEXT:    v_mov_b32_e32 v1, s2
211; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
212; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
213; GFX8-NEXT:    s_waitcnt vmcnt(0)
214; GFX8-NEXT:    buffer_wbinvl1_vol
215; GFX8-NEXT:  .LBB1_2:
216; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
217; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
219; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
220; GFX8-NEXT:    s_mov_b32 s7, 0xf000
221; GFX8-NEXT:    s_mov_b32 s6, -1
222; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
223; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
224; GFX8-NEXT:    s_endpgm
225;
226; GFX9-LABEL: add_i32_uniform:
227; GFX9:       ; %bb.0: ; %entry
228; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
229; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x34
230; GFX9-NEXT:    s_mov_b64 s[2:3], exec
231; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
232; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
233; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
234; GFX9-NEXT:    ; implicit-def: $vgpr1
235; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
236; GFX9-NEXT:    s_cbranch_execz .LBB1_2
237; GFX9-NEXT:  ; %bb.1:
238; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
239; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
240; GFX9-NEXT:    s_mul_i32 s2, s8, s2
241; GFX9-NEXT:    s_mov_b32 s15, 0xf000
242; GFX9-NEXT:    s_mov_b32 s14, -1
243; GFX9-NEXT:    s_mov_b32 s12, s6
244; GFX9-NEXT:    s_mov_b32 s13, s7
245; GFX9-NEXT:    v_mov_b32_e32 v1, s2
246; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
247; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
248; GFX9-NEXT:    s_waitcnt vmcnt(0)
249; GFX9-NEXT:    buffer_wbinvl1_vol
250; GFX9-NEXT:  .LBB1_2:
251; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
252; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
254; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
255; GFX9-NEXT:    s_mov_b32 s7, 0xf000
256; GFX9-NEXT:    s_mov_b32 s6, -1
257; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
258; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
259; GFX9-NEXT:    s_endpgm
260;
261; GFX1064-LABEL: add_i32_uniform:
262; GFX1064:       ; %bb.0: ; %entry
263; GFX1064-NEXT:    s_clause 0x1
264; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
265; GFX1064-NEXT:    s_load_dword s8, s[0:1], 0x34
266; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
267; GFX1064-NEXT:    ; implicit-def: $vgpr1
268; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
269; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
270; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
271; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
272; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
273; GFX1064-NEXT:  ; %bb.1:
274; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
275; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
276; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
277; GFX1064-NEXT:    s_mul_i32 s2, s8, s2
278; GFX1064-NEXT:    s_mov_b32 s14, -1
279; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
280; GFX1064-NEXT:    s_mov_b32 s12, s6
281; GFX1064-NEXT:    s_mov_b32 s13, s7
282; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
283; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
284; GFX1064-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
285; GFX1064-NEXT:    s_waitcnt vmcnt(0)
286; GFX1064-NEXT:    buffer_gl0_inv
287; GFX1064-NEXT:    buffer_gl1_inv
288; GFX1064-NEXT:  .LBB1_2:
289; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
290; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
291; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
292; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
293; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
294; GFX1064-NEXT:    s_mov_b32 s6, -1
295; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], null, s8, v0, s[0:1]
296; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
297; GFX1064-NEXT:    s_endpgm
298;
299; GFX1032-LABEL: add_i32_uniform:
300; GFX1032:       ; %bb.0: ; %entry
301; GFX1032-NEXT:    s_clause 0x1
302; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
303; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x34
304; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
305; GFX1032-NEXT:    ; implicit-def: $vgpr1
306; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
307; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
308; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
309; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
310; GFX1032-NEXT:  ; %bb.1:
311; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
312; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
313; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
314; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
315; GFX1032-NEXT:    s_mov_b32 s10, -1
316; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
317; GFX1032-NEXT:    s_mov_b32 s8, s6
318; GFX1032-NEXT:    s_mov_b32 s9, s7
319; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
320; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
321; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
322; GFX1032-NEXT:    s_waitcnt vmcnt(0)
323; GFX1032-NEXT:    buffer_gl0_inv
324; GFX1032-NEXT:    buffer_gl1_inv
325; GFX1032-NEXT:  .LBB1_2:
326; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
327; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
328; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
329; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
331; GFX1032-NEXT:    s_mov_b32 s6, -1
332; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v0, s[0:1]
333; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
334; GFX1032-NEXT:    s_endpgm
335entry:
336  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
337  store i32 %old, i32 addrspace(1)* %out
338  ret void
339}
340
; atomicrmw add on %inout where the addend is the result of
; llvm.amdgcn.workitem.id.x (arrives in v0); the returned old value is stored
; to %out. Expected code differs by run line:
;  * default target (no -mcpu): a plain buffer_atomic_add of v0 with no
;    mbcnt / s_and_saveexec guard around it.
;  * tonga and gfx900: inactive lanes get 0 (s_not_b64 exec pair), then a DPP
;    add sequence (row_shr 1/2/4/8, row_bcast 15/31) runs with all lanes
;    enabled. The value read from lane 63 feeds one guarded buffer_atomic_add;
;    a wave_shr:1 move into v1 supplies the per-lane term that is added to
;    readfirstlane(old) before the store.
;  * gfx1010 (wave64 and wave32): same outline, but the cross-row steps use
;    v_permlanex16_b32 plus v_readlane_b32 / v_writelane_b32 instead of
;    row_bcast / wave_shr.
341define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
342; GFX7LESS-LABEL: add_i32_varying:
343; GFX7LESS:       ; %bb.0: ; %entry
344; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
345; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
346; GFX7LESS-NEXT:    s_mov_b32 s6, -1
347; GFX7LESS-NEXT:    s_mov_b32 s10, s6
348; GFX7LESS-NEXT:    s_mov_b32 s11, s7
349; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
350; GFX7LESS-NEXT:    s_mov_b32 s8, s2
351; GFX7LESS-NEXT:    s_mov_b32 s9, s3
352; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
353; GFX7LESS-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
354; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
355; GFX7LESS-NEXT:    buffer_wbinvl1
356; GFX7LESS-NEXT:    s_mov_b32 s4, s0
357; GFX7LESS-NEXT:    s_mov_b32 s5, s1
358; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
359; GFX7LESS-NEXT:    s_endpgm
360;
361; GFX8-LABEL: add_i32_varying:
362; GFX8:       ; %bb.0: ; %entry
363; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
364; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
365; GFX8-NEXT:    v_mov_b32_e32 v1, 0
366; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
367; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
368; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
369; GFX8-NEXT:    v_mov_b32_e32 v2, v0
370; GFX8-NEXT:    s_not_b64 exec, exec
371; GFX8-NEXT:    v_mov_b32_e32 v2, 0
372; GFX8-NEXT:    s_not_b64 exec, exec
373; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
374; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
375; GFX8-NEXT:    s_nop 1
376; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
377; GFX8-NEXT:    s_nop 1
378; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
379; GFX8-NEXT:    s_nop 1
380; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
381; GFX8-NEXT:    s_nop 1
382; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
383; GFX8-NEXT:    s_nop 1
384; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
385; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
386; GFX8-NEXT:    s_nop 0
387; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
388; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
389; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
390; GFX8-NEXT:    ; implicit-def: $vgpr0
391; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
392; GFX8-NEXT:    s_cbranch_execz .LBB2_2
393; GFX8-NEXT:  ; %bb.1:
394; GFX8-NEXT:    s_mov_b32 s11, 0xf000
395; GFX8-NEXT:    s_mov_b32 s10, -1
396; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX8-NEXT:    s_mov_b32 s8, s2
398; GFX8-NEXT:    s_mov_b32 s9, s3
399; GFX8-NEXT:    v_mov_b32_e32 v0, s6
400; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
401; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
402; GFX8-NEXT:    s_waitcnt vmcnt(0)
403; GFX8-NEXT:    buffer_wbinvl1_vol
404; GFX8-NEXT:  .LBB2_2:
405; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
406; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
407; GFX8-NEXT:    v_mov_b32_e32 v0, v1
408; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
409; GFX8-NEXT:    s_mov_b32 s3, 0xf000
410; GFX8-NEXT:    s_mov_b32 s2, -1
411; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
412; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
413; GFX8-NEXT:    s_endpgm
414;
415; GFX9-LABEL: add_i32_varying:
416; GFX9:       ; %bb.0: ; %entry
417; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
418; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
419; GFX9-NEXT:    v_mov_b32_e32 v1, 0
420; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
421; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
422; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
423; GFX9-NEXT:    v_mov_b32_e32 v2, v0
424; GFX9-NEXT:    s_not_b64 exec, exec
425; GFX9-NEXT:    v_mov_b32_e32 v2, 0
426; GFX9-NEXT:    s_not_b64 exec, exec
427; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
428; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
429; GFX9-NEXT:    s_nop 1
430; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
431; GFX9-NEXT:    s_nop 1
432; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
433; GFX9-NEXT:    s_nop 1
434; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
435; GFX9-NEXT:    s_nop 1
436; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
437; GFX9-NEXT:    s_nop 1
438; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
439; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
440; GFX9-NEXT:    s_nop 0
441; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
442; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
443; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
444; GFX9-NEXT:    ; implicit-def: $vgpr0
445; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
446; GFX9-NEXT:    s_cbranch_execz .LBB2_2
447; GFX9-NEXT:  ; %bb.1:
448; GFX9-NEXT:    s_mov_b32 s11, 0xf000
449; GFX9-NEXT:    s_mov_b32 s10, -1
450; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX9-NEXT:    s_mov_b32 s8, s2
452; GFX9-NEXT:    s_mov_b32 s9, s3
453; GFX9-NEXT:    v_mov_b32_e32 v0, s6
454; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
455; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
456; GFX9-NEXT:    s_waitcnt vmcnt(0)
457; GFX9-NEXT:    buffer_wbinvl1_vol
458; GFX9-NEXT:  .LBB2_2:
459; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
460; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
461; GFX9-NEXT:    v_mov_b32_e32 v0, v1
462; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
463; GFX9-NEXT:    s_mov_b32 s3, 0xf000
464; GFX9-NEXT:    s_mov_b32 s2, -1
465; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
466; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
467; GFX9-NEXT:    s_endpgm
468;
469; GFX1064-LABEL: add_i32_varying:
470; GFX1064:       ; %bb.0: ; %entry
471; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
472; GFX1064-NEXT:    s_not_b64 exec, exec
473; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
474; GFX1064-NEXT:    s_not_b64 exec, exec
475; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
476; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
477; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
478; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
479; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
480; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
481; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
482; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
483; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
484; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
485; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
486; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
487; GFX1064-NEXT:    v_readlane_b32 s6, v1, 15
488; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
489; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
490; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
491; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
492; GFX1064-NEXT:    v_readlane_b32 s7, v1, 31
493; GFX1064-NEXT:    v_writelane_b32 v3, s6, 16
494; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
495; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
496; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
497; GFX1064-NEXT:    v_readlane_b32 s8, v1, 47
498; GFX1064-NEXT:    v_readlane_b32 s9, v1, 63
499; GFX1064-NEXT:    v_writelane_b32 v3, s7, 32
500; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
501; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
502; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
503; GFX1064-NEXT:    s_mov_b32 s4, s9
504; GFX1064-NEXT:    v_writelane_b32 v3, s8, 48
505; GFX1064-NEXT:    s_mov_b64 exec, s[6:7]
506; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
507; GFX1064-NEXT:    s_mov_b32 s6, -1
508; GFX1064-NEXT:    ; implicit-def: $vgpr0
509; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
510; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
511; GFX1064-NEXT:  ; %bb.1:
512; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
513; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
514; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
515; GFX1064-NEXT:    s_mov_b32 s4, s2
516; GFX1064-NEXT:    s_mov_b32 s5, s3
517; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
518; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
519; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
520; GFX1064-NEXT:    s_waitcnt vmcnt(0)
521; GFX1064-NEXT:    buffer_gl0_inv
522; GFX1064-NEXT:    buffer_gl1_inv
523; GFX1064-NEXT:  .LBB2_2:
524; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
525; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
526; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
527; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
528; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
529; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
530; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s2, v0
531; GFX1064-NEXT:    s_mov_b32 s2, s6
532; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
533; GFX1064-NEXT:    s_endpgm
534;
535; GFX1032-LABEL: add_i32_varying:
536; GFX1032:       ; %bb.0: ; %entry
537; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
538; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
539; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
540; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
541; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
542; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
543; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
544; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
545; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
546; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
547; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
548; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
549; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
550; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
551; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
552; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
553; GFX1032-NEXT:    v_readlane_b32 s5, v1, 15
554; GFX1032-NEXT:    v_readlane_b32 s6, v1, 31
555; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
556; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
557; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
558; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
559; GFX1032-NEXT:    v_writelane_b32 v3, s5, 16
560; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
561; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
562; GFX1032-NEXT:    s_mov_b32 s4, s6
563; GFX1032-NEXT:    s_mov_b32 s6, -1
564; GFX1032-NEXT:    ; implicit-def: $vgpr0
565; GFX1032-NEXT:    s_and_saveexec_b32 s8, vcc_lo
566; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
567; GFX1032-NEXT:  ; %bb.1:
568; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
569; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
570; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
571; GFX1032-NEXT:    s_mov_b32 s4, s2
572; GFX1032-NEXT:    s_mov_b32 s5, s3
573; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
574; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
575; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
576; GFX1032-NEXT:    s_waitcnt vmcnt(0)
577; GFX1032-NEXT:    buffer_gl0_inv
578; GFX1032-NEXT:    buffer_gl1_inv
579; GFX1032-NEXT:  .LBB2_2:
580; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
581; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
582; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
584; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
585; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
586; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s2, v0
587; GFX1032-NEXT:    s_mov_b32 s2, s6
588; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
589; GFX1032-NEXT:    s_endpgm
590entry:
591  %lane = call i32 @llvm.amdgcn.workitem.id.x()
592  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
593  store i32 %old, i32 addrspace(1)* %out
594  ret void
595}
596
597define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
598; GFX7LESS-LABEL: add_i64_constant:
599; GFX7LESS:       ; %bb.0: ; %entry
600; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
601; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
602; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
603; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
604; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
605; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
606; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
607; GFX7LESS-NEXT:    s_cbranch_execz .LBB3_2
608; GFX7LESS-NEXT:  ; %bb.1:
609; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
610; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
611; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
612; GFX7LESS-NEXT:    s_mov_b32 s10, -1
613; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
614; GFX7LESS-NEXT:    s_mov_b32 s8, s2
615; GFX7LESS-NEXT:    s_mov_b32 s9, s3
616; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
617; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
618; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
619; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
620; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
621; GFX7LESS-NEXT:    buffer_wbinvl1
622; GFX7LESS-NEXT:  .LBB3_2:
623; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
624; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
626; GFX7LESS-NEXT:    s_mov_b32 s2, -1
627; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
628; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
629; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
630; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
631; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
632; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
633; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
634; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
635; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
636; GFX7LESS-NEXT:    s_endpgm
637;
638; GFX89-LABEL: add_i64_constant:
639; GFX89:       ; %bb.0: ; %entry
640; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
641; GFX89-NEXT:    s_mov_b64 s[6:7], exec
642; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
643; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
644; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
645; GFX89-NEXT:    ; implicit-def: $vgpr0_vgpr1
646; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
647; GFX89-NEXT:    s_cbranch_execz .LBB3_2
648; GFX89-NEXT:  ; %bb.1:
649; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
650; GFX89-NEXT:    s_mov_b32 s8, s2
651; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
652; GFX89-NEXT:    s_mul_i32 s2, s2, 5
653; GFX89-NEXT:    s_mov_b32 s11, 0xf000
654; GFX89-NEXT:    s_mov_b32 s10, -1
655; GFX89-NEXT:    s_mov_b32 s9, s3
656; GFX89-NEXT:    v_mov_b32_e32 v0, s2
657; GFX89-NEXT:    v_mov_b32_e32 v1, 0
658; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
659; GFX89-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
660; GFX89-NEXT:    s_waitcnt vmcnt(0)
661; GFX89-NEXT:    buffer_wbinvl1_vol
662; GFX89-NEXT:  .LBB3_2:
663; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
664; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
665; GFX89-NEXT:    v_readfirstlane_b32 s2, v0
666; GFX89-NEXT:    v_readfirstlane_b32 s3, v1
667; GFX89-NEXT:    v_mov_b32_e32 v0, s2
668; GFX89-NEXT:    v_mov_b32_e32 v1, s3
669; GFX89-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
670; GFX89-NEXT:    s_mov_b32 s3, 0xf000
671; GFX89-NEXT:    s_mov_b32 s2, -1
672; GFX89-NEXT:    s_nop 2
673; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
674; GFX89-NEXT:    s_endpgm
675;
676; GFX1064-LABEL: add_i64_constant:
677; GFX1064:       ; %bb.0: ; %entry
678; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
679; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
680; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
681; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
682; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
683; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
684; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
685; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
686; GFX1064-NEXT:  ; %bb.1:
687; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
688; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
689; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
690; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
691; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
692; GFX1064-NEXT:    s_mov_b32 s10, -1
693; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
694; GFX1064-NEXT:    s_mov_b32 s8, s2
695; GFX1064-NEXT:    s_mov_b32 s9, s3
696; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
697; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
698; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
699; GFX1064-NEXT:    s_waitcnt vmcnt(0)
700; GFX1064-NEXT:    buffer_gl0_inv
701; GFX1064-NEXT:    buffer_gl1_inv
702; GFX1064-NEXT:  .LBB3_2:
703; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
704; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
705; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
706; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
707; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
708; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
709; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
710; GFX1064-NEXT:    s_mov_b32 s2, -1
711; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
712; GFX1064-NEXT:    s_endpgm
713;
714; GFX1032-LABEL: add_i64_constant:
715; GFX1032:       ; %bb.0: ; %entry
716; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
717; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
718; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
719; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
720; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
721; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
722; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
723; GFX1032-NEXT:  ; %bb.1:
724; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
725; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
726; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
727; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
728; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
729; GFX1032-NEXT:    s_mov_b32 s10, -1
730; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
731; GFX1032-NEXT:    s_mov_b32 s8, s2
732; GFX1032-NEXT:    s_mov_b32 s9, s3
733; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
734; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
735; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
736; GFX1032-NEXT:    s_waitcnt vmcnt(0)
737; GFX1032-NEXT:    buffer_gl0_inv
738; GFX1032-NEXT:    buffer_gl1_inv
739; GFX1032-NEXT:  .LBB3_2:
740; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
741; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
742; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
743; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
744; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
745; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
746; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
747; GFX1032-NEXT:    s_mov_b32 s2, -1
748; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
749; GFX1032-NEXT:    s_endpgm
750entry:
751  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
752  store i64 %old, i64 addrspace(1)* %out
753  ret void
754}
755
; add_i64_uniform: 64-bit `atomicrmw add` (acq_rel) on a global pointer where
; the addend is the kernel argument %additive; the value returned by the atomic
; is stored to %out.
;
; What the checks below pin down on every target:
;   * exec is copied to an SGPR and v_mbcnt_* is applied to that copy; only
;     lanes whose mbcnt result is 0 enter %bb.1 (v_cmp_eq + s_and_saveexec).
;   * %bb.1 multiplies the 64-bit addend by s_bcnt1 of the saved exec mask and
;     issues a single buffer_atomic_add_x2 ... glc with that product.
;   * After exec is restored, v_readfirstlane reads the atomic's result and
;     each lane adds addend * <its mbcnt value> to it before the store.
; GFX8 and GFX9 have separate check blocks here because their multiply
; sequences differ (v_mad_u64_u32 vs. s_mul_i32/s_mul_hi_u32).
756define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
757; GFX7LESS-LABEL: add_i64_uniform:
758; GFX7LESS:       ; %bb.0: ; %entry
759; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
760; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
761; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
762; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
763; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s9, v0
764; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
765; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
766; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
767; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
768; GFX7LESS-NEXT:  ; %bb.1:
769; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
770; GFX7LESS-NEXT:    s_mov_b32 s14, -1
771; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
772; GFX7LESS-NEXT:    s_mov_b32 s12, s6
773; GFX7LESS-NEXT:    s_mov_b32 s13, s7
774; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
775; GFX7LESS-NEXT:    s_mul_i32 s7, s1, s6
776; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
777; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s0, v0
778; GFX7LESS-NEXT:    s_mul_i32 s6, s0, s6
779; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
780; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
781; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
782; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
783; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
784; GFX7LESS-NEXT:    buffer_wbinvl1
785; GFX7LESS-NEXT:  .LBB4_2:
786; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
787; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
788; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
789; GFX7LESS-NEXT:    s_mov_b32 s6, -1
790; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
791; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v1
792; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
793; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s1, v2
794; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v2
795; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s0, v2
796; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
797; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s3
798; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
799; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
800; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
801; GFX7LESS-NEXT:    s_endpgm
802;
803; GFX8-LABEL: add_i64_uniform:
804; GFX8:       ; %bb.0: ; %entry
805; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
806; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
807; GFX8-NEXT:    s_mov_b64 s[8:9], exec
808; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
809; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
810; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
811; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
812; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
813; GFX8-NEXT:    s_cbranch_execz .LBB4_2
814; GFX8-NEXT:  ; %bb.1:
815; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
816; GFX8-NEXT:    s_mov_b32 s12, s6
817; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
818; GFX8-NEXT:    v_mov_b32_e32 v0, s6
819; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
820; GFX8-NEXT:    s_mul_i32 s6, s1, s6
821; GFX8-NEXT:    s_mov_b32 s15, 0xf000
822; GFX8-NEXT:    s_mov_b32 s14, -1
823; GFX8-NEXT:    s_mov_b32 s13, s7
824; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
825; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
826; GFX8-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
827; GFX8-NEXT:    s_waitcnt vmcnt(0)
828; GFX8-NEXT:    buffer_wbinvl1_vol
829; GFX8-NEXT:  .LBB4_2:
830; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
831; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
832; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
833; GFX8-NEXT:    v_mov_b32_e32 v0, s2
834; GFX8-NEXT:    v_mov_b32_e32 v1, s3
835; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
836; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v2
837; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
838; GFX8-NEXT:    s_mov_b32 s7, 0xf000
839; GFX8-NEXT:    s_mov_b32 s6, -1
840; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
841; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
842; GFX8-NEXT:    s_endpgm
843;
844; GFX9-LABEL: add_i64_uniform:
845; GFX9:       ; %bb.0: ; %entry
846; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
847; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
848; GFX9-NEXT:    s_mov_b64 s[8:9], exec
849; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
850; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
851; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
852; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
853; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
854; GFX9-NEXT:    s_cbranch_execz .LBB4_2
855; GFX9-NEXT:  ; %bb.1:
856; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
857; GFX9-NEXT:    s_mov_b32 s12, s6
858; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
859; GFX9-NEXT:    s_mov_b32 s13, s7
860; GFX9-NEXT:    s_mul_i32 s7, s3, s6
861; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
862; GFX9-NEXT:    s_add_i32 s8, s8, s7
863; GFX9-NEXT:    s_mul_i32 s6, s2, s6
864; GFX9-NEXT:    s_mov_b32 s15, 0xf000
865; GFX9-NEXT:    s_mov_b32 s14, -1
866; GFX9-NEXT:    v_mov_b32_e32 v0, s6
867; GFX9-NEXT:    v_mov_b32_e32 v1, s8
868; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
869; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
870; GFX9-NEXT:    s_waitcnt vmcnt(0)
871; GFX9-NEXT:    buffer_wbinvl1_vol
872; GFX9-NEXT:  .LBB4_2:
873; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
874; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
875; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
876; GFX9-NEXT:    v_mov_b32_e32 v0, s0
877; GFX9-NEXT:    v_mov_b32_e32 v1, s1
878; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
879; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
880; GFX9-NEXT:    s_mov_b32 s7, 0xf000
881; GFX9-NEXT:    s_mov_b32 s6, -1
882; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
883; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
884; GFX9-NEXT:    s_endpgm
885;
886; GFX1064-LABEL: add_i64_uniform:
887; GFX1064:       ; %bb.0: ; %entry
888; GFX1064-NEXT:    s_clause 0x1
889; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
890; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
891; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
892; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
893; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
894; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
895; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
896; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
897; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
898; GFX1064-NEXT:  ; %bb.1:
899; GFX1064-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
900; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
901; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
902; GFX1064-NEXT:    s_mul_i32 s9, s3, s8
903; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s8
904; GFX1064-NEXT:    s_mul_i32 s8, s2, s8
905; GFX1064-NEXT:    s_add_i32 s10, s10, s9
906; GFX1064-NEXT:    v_mov_b32_e32 v0, s8
907; GFX1064-NEXT:    v_mov_b32_e32 v1, s10
908; GFX1064-NEXT:    s_mov_b32 s10, -1
909; GFX1064-NEXT:    s_mov_b32 s8, s6
910; GFX1064-NEXT:    s_mov_b32 s9, s7
911; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
912; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
913; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
914; GFX1064-NEXT:    s_waitcnt vmcnt(0)
915; GFX1064-NEXT:    buffer_gl0_inv
916; GFX1064-NEXT:    buffer_gl1_inv
917; GFX1064-NEXT:  .LBB4_2:
918; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
919; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
920; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
921; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
922; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
923; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
924; GFX1064-NEXT:    s_mov_b32 s6, -1
925; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1]
926; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], null, s3, v2, v[1:2]
927; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
928; GFX1064-NEXT:    s_endpgm
929;
930; GFX1032-LABEL: add_i64_uniform:
931; GFX1032:       ; %bb.0: ; %entry
932; GFX1032-NEXT:    s_clause 0x1
933; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
934; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
935; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
936; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
937; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s8, 0
938; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
939; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
940; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
941; GFX1032-NEXT:  ; %bb.1:
942; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s8
943; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
944; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
945; GFX1032-NEXT:    s_mul_i32 s8, s3, s1
946; GFX1032-NEXT:    s_mul_hi_u32 s9, s2, s1
947; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
948; GFX1032-NEXT:    s_add_i32 s9, s9, s8
949; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
950; GFX1032-NEXT:    v_mov_b32_e32 v1, s9
951; GFX1032-NEXT:    s_mov_b32 s10, -1
952; GFX1032-NEXT:    s_mov_b32 s8, s6
953; GFX1032-NEXT:    s_mov_b32 s9, s7
954; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
955; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
956; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
957; GFX1032-NEXT:    s_waitcnt vmcnt(0)
958; GFX1032-NEXT:    buffer_gl0_inv
959; GFX1032-NEXT:    buffer_gl1_inv
960; GFX1032-NEXT:  .LBB4_2:
961; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
962; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
963; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
964; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
965; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
966; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
967; GFX1032-NEXT:    s_mov_b32 s6, -1
968; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], null, s2, v2, s[0:1]
969; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], null, s3, v2, v[1:2]
970; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
971; GFX1032-NEXT:    s_endpgm
972entry:
973  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
974  store i64 %old, i64 addrspace(1)* %out
975  ret void
976}
977
; add_i64_varying: 64-bit `atomicrmw add` (acq_rel) on a global pointer where
; the addend is the zero-extended result of llvm.amdgcn.workitem.id.x, so it
; differs between lanes; the value returned by the atomic is stored to %out.
;
; What the checks below pin down: unlike the constant/uniform i64 cases, no
; v_mbcnt / s_and_saveexec / v_readfirstlane sequence is expected on any
; target. Each configuration emits one straight-line
; buffer_atomic_add_x2 v[0:1] ... glc followed directly by the store of
; v[0:1], with v1 set to 0 beforehand (the high half of the zext).
; The output is identical for GFX8/GFX9 and for both GFX10 wave sizes, which
; is why the shared GFX89 and GFX10 prefixes are used here.
978define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
979; GFX7LESS-LABEL: add_i64_varying:
980; GFX7LESS:       ; %bb.0: ; %entry
981; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
982; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
983; GFX7LESS-NEXT:    s_mov_b32 s6, -1
984; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
985; GFX7LESS-NEXT:    s_mov_b32 s10, s6
986; GFX7LESS-NEXT:    s_mov_b32 s11, s7
987; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX7LESS-NEXT:    s_mov_b32 s8, s2
989; GFX7LESS-NEXT:    s_mov_b32 s9, s3
990; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
991; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
992; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
993; GFX7LESS-NEXT:    buffer_wbinvl1
994; GFX7LESS-NEXT:    s_mov_b32 s4, s0
995; GFX7LESS-NEXT:    s_mov_b32 s5, s1
996; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
997; GFX7LESS-NEXT:    s_endpgm
998;
999; GFX89-LABEL: add_i64_varying:
1000; GFX89:       ; %bb.0: ; %entry
1001; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1002; GFX89-NEXT:    s_mov_b32 s7, 0xf000
1003; GFX89-NEXT:    s_mov_b32 s6, -1
1004; GFX89-NEXT:    s_mov_b32 s10, s6
1005; GFX89-NEXT:    s_mov_b32 s11, s7
1006; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX89-NEXT:    s_mov_b32 s8, s2
1008; GFX89-NEXT:    s_mov_b32 s9, s3
1009; GFX89-NEXT:    v_mov_b32_e32 v1, 0
1010; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1011; GFX89-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1012; GFX89-NEXT:    s_waitcnt vmcnt(0)
1013; GFX89-NEXT:    buffer_wbinvl1_vol
1014; GFX89-NEXT:    s_mov_b32 s4, s0
1015; GFX89-NEXT:    s_mov_b32 s5, s1
1016; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1017; GFX89-NEXT:    s_endpgm
1018;
1019; GFX10-LABEL: add_i64_varying:
1020; GFX10:       ; %bb.0: ; %entry
1021; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1022; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1023; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1024; GFX10-NEXT:    s_mov_b32 s6, -1
1025; GFX10-NEXT:    s_mov_b32 s11, s7
1026; GFX10-NEXT:    s_mov_b32 s10, s6
1027; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1028; GFX10-NEXT:    s_mov_b32 s8, s2
1029; GFX10-NEXT:    s_mov_b32 s9, s3
1030; GFX10-NEXT:    s_mov_b32 s4, s0
1031; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1032; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1033; GFX10-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1034; GFX10-NEXT:    s_waitcnt vmcnt(0)
1035; GFX10-NEXT:    buffer_gl0_inv
1036; GFX10-NEXT:    buffer_gl1_inv
1037; GFX10-NEXT:    s_mov_b32 s5, s1
1038; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1039; GFX10-NEXT:    s_endpgm
1040entry:
1041  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1042  %zext = zext i32 %lane to i64
1043  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
1044  store i64 %old, i64 addrspace(1)* %out
1045  ret void
1046}
1047
; sub_i32_constant: 32-bit `atomicrmw sub` (acq_rel) of the constant 5 on a
; global pointer; the value returned by the atomic is stored to %out.
;
; What the checks below pin down on every target:
;   * exec is copied to an SGPR and v_mbcnt_* is applied to that copy; only
;     lanes whose mbcnt result is 0 enter %bb.1 (v_cmp_eq + s_and_saveexec).
;   * %bb.1 computes s_bcnt1(saved exec) * 5 and issues a single
;     buffer_atomic_sub ... glc with that value.
;   * After exec is restored, v_readfirstlane reads the atomic's result and
;     each lane subtracts 5 * <its mbcnt value> (v_mul_u32_u24 + v_sub) before
;     the store.
; The wave32 run (GFX1032) uses exec_lo, v_mbcnt_lo only, and s_bcnt1_i32_b32.
1048define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
1049; GFX7LESS-LABEL: sub_i32_constant:
1050; GFX7LESS:       ; %bb.0: ; %entry
1051; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1052; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1053; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1054; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1055; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1056; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1057; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1058; GFX7LESS-NEXT:    s_cbranch_execz .LBB6_2
1059; GFX7LESS-NEXT:  ; %bb.1:
1060; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
1061; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1062; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
1063; GFX7LESS-NEXT:    s_mov_b32 s10, -1
1064; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1065; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1066; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1067; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1068; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1069; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1070; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1071; GFX7LESS-NEXT:    buffer_wbinvl1
1072; GFX7LESS-NEXT:  .LBB6_2:
1073; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1074; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1076; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1077; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1078; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1079; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1080; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1081; GFX7LESS-NEXT:    s_endpgm
1082;
1083; GFX8-LABEL: sub_i32_constant:
1084; GFX8:       ; %bb.0: ; %entry
1085; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1086; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1087; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1088; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1089; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1090; GFX8-NEXT:    ; implicit-def: $vgpr1
1091; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1092; GFX8-NEXT:    s_cbranch_execz .LBB6_2
1093; GFX8-NEXT:  ; %bb.1:
1094; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1095; GFX8-NEXT:    s_mov_b32 s8, s2
1096; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1097; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1098; GFX8-NEXT:    s_mov_b32 s11, 0xf000
1099; GFX8-NEXT:    s_mov_b32 s10, -1
1100; GFX8-NEXT:    s_mov_b32 s9, s3
1101; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1102; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1103; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1104; GFX8-NEXT:    s_waitcnt vmcnt(0)
1105; GFX8-NEXT:    buffer_wbinvl1_vol
1106; GFX8-NEXT:  .LBB6_2:
1107; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1108; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
1109; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1110; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1111; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1112; GFX8-NEXT:    s_mov_b32 s2, -1
1113; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
1114; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1115; GFX8-NEXT:    s_endpgm
1116;
1117; GFX9-LABEL: sub_i32_constant:
1118; GFX9:       ; %bb.0: ; %entry
1119; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1120; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1121; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1122; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1123; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1124; GFX9-NEXT:    ; implicit-def: $vgpr1
1125; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1126; GFX9-NEXT:    s_cbranch_execz .LBB6_2
1127; GFX9-NEXT:  ; %bb.1:
1128; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1129; GFX9-NEXT:    s_mov_b32 s8, s2
1130; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1131; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1132; GFX9-NEXT:    s_mov_b32 s11, 0xf000
1133; GFX9-NEXT:    s_mov_b32 s10, -1
1134; GFX9-NEXT:    s_mov_b32 s9, s3
1135; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1136; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1137; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1138; GFX9-NEXT:    s_waitcnt vmcnt(0)
1139; GFX9-NEXT:    buffer_wbinvl1_vol
1140; GFX9-NEXT:  .LBB6_2:
1141; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1142; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
1143; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1144; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1145; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1146; GFX9-NEXT:    s_mov_b32 s2, -1
1147; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1148; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1149; GFX9-NEXT:    s_endpgm
1150;
1151; GFX1064-LABEL: sub_i32_constant:
1152; GFX1064:       ; %bb.0: ; %entry
1153; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1154; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1155; GFX1064-NEXT:    ; implicit-def: $vgpr1
1156; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1157; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1158; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1159; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1160; GFX1064-NEXT:    s_cbranch_execz .LBB6_2
1161; GFX1064-NEXT:  ; %bb.1:
1162; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1163; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
1164; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
1165; GFX1064-NEXT:    s_mov_b32 s10, -1
1166; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1167; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1168; GFX1064-NEXT:    s_mov_b32 s8, s2
1169; GFX1064-NEXT:    s_mov_b32 s9, s3
1170; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1171; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1172; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1173; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1174; GFX1064-NEXT:    buffer_gl0_inv
1175; GFX1064-NEXT:    buffer_gl1_inv
1176; GFX1064-NEXT:  .LBB6_2:
1177; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1178; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1179; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1180; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1181; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1182; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1183; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1184; GFX1064-NEXT:    s_mov_b32 s2, -1
1185; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1186; GFX1064-NEXT:    s_endpgm
1187;
1188; GFX1032-LABEL: sub_i32_constant:
1189; GFX1032:       ; %bb.0: ; %entry
1190; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1191; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1192; GFX1032-NEXT:    ; implicit-def: $vgpr1
1193; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1194; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1195; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1196; GFX1032-NEXT:    s_cbranch_execz .LBB6_2
1197; GFX1032-NEXT:  ; %bb.1:
1198; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1199; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
1200; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
1201; GFX1032-NEXT:    s_mov_b32 s10, -1
1202; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1203; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1204; GFX1032-NEXT:    s_mov_b32 s8, s2
1205; GFX1032-NEXT:    s_mov_b32 s9, s3
1206; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1207; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1208; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1209; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1210; GFX1032-NEXT:    buffer_gl0_inv
1211; GFX1032-NEXT:    buffer_gl1_inv
1212; GFX1032-NEXT:  .LBB6_2:
1213; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1214; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1215; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1216; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1217; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1218; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1219; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1220; GFX1032-NEXT:    s_mov_b32 s2, -1
1221; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1222; GFX1032-NEXT:    s_endpgm
1223entry:
1224  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
1225  store i32 %old, i32 addrspace(1)* %out
1226  ret void
1227}
1228
; sub_i32_uniform: 32-bit `atomicrmw sub` (acq_rel) on a global pointer where
; the subtrahend is the kernel argument %subitive; the value returned by the
; atomic is stored to %out.
;
; What the checks below pin down on every target:
;   * exec is copied to an SGPR and v_mbcnt_* is applied to that copy; only
;     lanes whose mbcnt result is 0 enter %bb.1 (v_cmp_eq + s_and_saveexec).
;   * %bb.1 multiplies the loaded argument by s_bcnt1 of the saved exec mask
;     (s_mul_i32) and issues a single buffer_atomic_sub ... glc.
;   * After exec is restored, v_readfirstlane reads the atomic's result and
;     each lane subtracts argument * <its mbcnt value> (v_mul_lo_u32 + v_sub)
;     before the store.
; The wave32 run (GFX1032) uses exec_lo, v_mbcnt_lo only, and s_bcnt1_i32_b32.
1229define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
1230; GFX7LESS-LABEL: sub_i32_uniform:
1231; GFX7LESS:       ; %bb.0: ; %entry
1232; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1233; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1234; GFX7LESS-NEXT:    s_load_dword s8, s[0:1], 0xd
1235; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1236; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1237; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1238; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1239; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1240; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
1241; GFX7LESS-NEXT:  ; %bb.1:
1242; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
1243; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1244; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1245; GFX7LESS-NEXT:    s_mul_i32 s2, s8, s2
1246; GFX7LESS-NEXT:    s_mov_b32 s14, -1
1247; GFX7LESS-NEXT:    s_mov_b32 s12, s6
1248; GFX7LESS-NEXT:    s_mov_b32 s13, s7
1249; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
1250; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1251; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1252; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1253; GFX7LESS-NEXT:    buffer_wbinvl1
1254; GFX7LESS-NEXT:  .LBB7_2:
1255; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1256; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1257; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1258; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1259; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1260; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s8, v0
1261; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1262; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1263; GFX7LESS-NEXT:    s_endpgm
1264;
1265; GFX8-LABEL: sub_i32_uniform:
1266; GFX8:       ; %bb.0: ; %entry
1267; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1268; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x34
1269; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1270; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1271; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1272; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1273; GFX8-NEXT:    ; implicit-def: $vgpr1
1274; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1275; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1276; GFX8-NEXT:  ; %bb.1:
1277; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1278; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1279; GFX8-NEXT:    s_mul_i32 s2, s8, s2
1280; GFX8-NEXT:    s_mov_b32 s15, 0xf000
1281; GFX8-NEXT:    s_mov_b32 s14, -1
1282; GFX8-NEXT:    s_mov_b32 s12, s6
1283; GFX8-NEXT:    s_mov_b32 s13, s7
1284; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1285; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1286; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1287; GFX8-NEXT:    s_waitcnt vmcnt(0)
1288; GFX8-NEXT:    buffer_wbinvl1_vol
1289; GFX8-NEXT:  .LBB7_2:
1290; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1291; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1292; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
1293; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1294; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1295; GFX8-NEXT:    s_mov_b32 s6, -1
1296; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1297; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1298; GFX8-NEXT:    s_endpgm
1299;
1300; GFX9-LABEL: sub_i32_uniform:
1301; GFX9:       ; %bb.0: ; %entry
1302; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1303; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x34
1304; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1305; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1306; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1307; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1308; GFX9-NEXT:    ; implicit-def: $vgpr1
1309; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1310; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1311; GFX9-NEXT:  ; %bb.1:
1312; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1313; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1314; GFX9-NEXT:    s_mul_i32 s2, s8, s2
1315; GFX9-NEXT:    s_mov_b32 s15, 0xf000
1316; GFX9-NEXT:    s_mov_b32 s14, -1
1317; GFX9-NEXT:    s_mov_b32 s12, s6
1318; GFX9-NEXT:    s_mov_b32 s13, s7
1319; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1320; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1321; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1322; GFX9-NEXT:    s_waitcnt vmcnt(0)
1323; GFX9-NEXT:    buffer_wbinvl1_vol
1324; GFX9-NEXT:  .LBB7_2:
1325; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1326; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1327; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
1328; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1329; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1330; GFX9-NEXT:    s_mov_b32 s6, -1
1331; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1332; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1333; GFX9-NEXT:    s_endpgm
1334;
1335; GFX1064-LABEL: sub_i32_uniform:
1336; GFX1064:       ; %bb.0: ; %entry
1337; GFX1064-NEXT:    s_clause 0x1
1338; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1339; GFX1064-NEXT:    s_load_dword s8, s[0:1], 0x34
1340; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1341; GFX1064-NEXT:    ; implicit-def: $vgpr1
1342; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1343; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1344; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1345; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1346; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
1347; GFX1064-NEXT:  ; %bb.1:
1348; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1349; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
1350; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1351; GFX1064-NEXT:    s_mul_i32 s2, s8, s2
1352; GFX1064-NEXT:    s_mov_b32 s14, -1
1353; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
1354; GFX1064-NEXT:    s_mov_b32 s12, s6
1355; GFX1064-NEXT:    s_mov_b32 s13, s7
1356; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1357; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1358; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1359; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1360; GFX1064-NEXT:    buffer_gl0_inv
1361; GFX1064-NEXT:    buffer_gl1_inv
1362; GFX1064-NEXT:  .LBB7_2:
1363; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1364; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1365; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1366; GFX1064-NEXT:    v_mul_lo_u32 v0, s8, v0
1367; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1368; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1369; GFX1064-NEXT:    s_mov_b32 s6, -1
1370; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1371; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1372; GFX1064-NEXT:    s_endpgm
1373;
1374; GFX1032-LABEL: sub_i32_uniform:
1375; GFX1032:       ; %bb.0: ; %entry
1376; GFX1032-NEXT:    s_clause 0x1
1377; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1378; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x34
1379; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1380; GFX1032-NEXT:    ; implicit-def: $vgpr1
1381; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1382; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1383; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1384; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
1385; GFX1032-NEXT:  ; %bb.1:
1386; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1387; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
1388; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1390; GFX1032-NEXT:    s_mov_b32 s10, -1
1391; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
1392; GFX1032-NEXT:    s_mov_b32 s8, s6
1393; GFX1032-NEXT:    s_mov_b32 s9, s7
1394; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1395; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1396; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1397; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1398; GFX1032-NEXT:    buffer_gl0_inv
1399; GFX1032-NEXT:    buffer_gl1_inv
1400; GFX1032-NEXT:  .LBB7_2:
1401; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1402; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1403; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1404; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1405; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1406; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1407; GFX1032-NEXT:    s_mov_b32 s6, -1
1408; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1409; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1410; GFX1032-NEXT:    s_endpgm
1411entry:
1412  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
1413  store i32 %old, i32 addrspace(1)* %out
1414  ret void
1415}
1416
; sub_i32_varying: 32-bit atomicrmw sub (acq_rel) on a global (addrspace(1))
; pointer whose operand is the workitem id, so the subtracted value is not the
; same in every lane. The value returned by the atomic is stored to %out.
; In the expected output below, the GFX7LESS run issues buffer_atomic_sub
; directly on v0. The GFX8 and GFX9 runs, and both GFX10 wave-size runs,
; instead combine the lane values with a sequence of DPP adds, perform one
; buffer_atomic_sub inside an s_and_saveexec region, and afterwards subtract a
; per-lane value produced by that DPP sequence from the v_readfirstlane result.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1417define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
1418; GFX7LESS-LABEL: sub_i32_varying:
1419; GFX7LESS:       ; %bb.0: ; %entry
1420; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1421; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1422; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1423; GFX7LESS-NEXT:    s_mov_b32 s10, s6
1424; GFX7LESS-NEXT:    s_mov_b32 s11, s7
1425; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1426; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1427; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1428; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1429; GFX7LESS-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1430; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1431; GFX7LESS-NEXT:    buffer_wbinvl1
1432; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1433; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1434; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1435; GFX7LESS-NEXT:    s_endpgm
1436;
1437; GFX8-LABEL: sub_i32_varying:
1438; GFX8:       ; %bb.0: ; %entry
1439; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1440; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1441; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1442; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1443; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1444; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1445; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1446; GFX8-NEXT:    s_not_b64 exec, exec
1447; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1448; GFX8-NEXT:    s_not_b64 exec, exec
1449; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1450; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1451; GFX8-NEXT:    s_nop 1
1452; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1453; GFX8-NEXT:    s_nop 1
1454; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1455; GFX8-NEXT:    s_nop 1
1456; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1457; GFX8-NEXT:    s_nop 1
1458; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1459; GFX8-NEXT:    s_nop 1
1460; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1461; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
1462; GFX8-NEXT:    s_nop 0
1463; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1464; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1465; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1466; GFX8-NEXT:    ; implicit-def: $vgpr0
1467; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1468; GFX8-NEXT:    s_cbranch_execz .LBB8_2
1469; GFX8-NEXT:  ; %bb.1:
1470; GFX8-NEXT:    s_mov_b32 s11, 0xf000
1471; GFX8-NEXT:    s_mov_b32 s10, -1
1472; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1473; GFX8-NEXT:    s_mov_b32 s8, s2
1474; GFX8-NEXT:    s_mov_b32 s9, s3
1475; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1476; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1477; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1478; GFX8-NEXT:    s_waitcnt vmcnt(0)
1479; GFX8-NEXT:    buffer_wbinvl1_vol
1480; GFX8-NEXT:  .LBB8_2:
1481; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1482; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
1483; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1484; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1485; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1486; GFX8-NEXT:    s_mov_b32 s2, -1
1487; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
1488; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1489; GFX8-NEXT:    s_endpgm
1490;
1491; GFX9-LABEL: sub_i32_varying:
1492; GFX9:       ; %bb.0: ; %entry
1493; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1494; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1495; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1496; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1497; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1498; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1499; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1500; GFX9-NEXT:    s_not_b64 exec, exec
1501; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1502; GFX9-NEXT:    s_not_b64 exec, exec
1503; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1504; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1505; GFX9-NEXT:    s_nop 1
1506; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1507; GFX9-NEXT:    s_nop 1
1508; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1509; GFX9-NEXT:    s_nop 1
1510; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1511; GFX9-NEXT:    s_nop 1
1512; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1513; GFX9-NEXT:    s_nop 1
1514; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1515; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
1516; GFX9-NEXT:    s_nop 0
1517; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1518; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1519; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1520; GFX9-NEXT:    ; implicit-def: $vgpr0
1521; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1522; GFX9-NEXT:    s_cbranch_execz .LBB8_2
1523; GFX9-NEXT:  ; %bb.1:
1524; GFX9-NEXT:    s_mov_b32 s11, 0xf000
1525; GFX9-NEXT:    s_mov_b32 s10, -1
1526; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1527; GFX9-NEXT:    s_mov_b32 s8, s2
1528; GFX9-NEXT:    s_mov_b32 s9, s3
1529; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1530; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1531; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1532; GFX9-NEXT:    s_waitcnt vmcnt(0)
1533; GFX9-NEXT:    buffer_wbinvl1_vol
1534; GFX9-NEXT:  .LBB8_2:
1535; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1536; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1537; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1538; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1539; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1540; GFX9-NEXT:    s_mov_b32 s2, -1
1541; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1542; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1543; GFX9-NEXT:    s_endpgm
1544;
1545; GFX1064-LABEL: sub_i32_varying:
1546; GFX1064:       ; %bb.0: ; %entry
1547; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
1548; GFX1064-NEXT:    s_not_b64 exec, exec
1549; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1550; GFX1064-NEXT:    s_not_b64 exec, exec
1551; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
1552; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1553; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
1554; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1555; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1556; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1557; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
1558; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1559; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1560; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
1561; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
1562; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1563; GFX1064-NEXT:    v_readlane_b32 s6, v1, 15
1564; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1565; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
1566; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1567; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
1568; GFX1064-NEXT:    v_readlane_b32 s7, v1, 31
1569; GFX1064-NEXT:    v_writelane_b32 v3, s6, 16
1570; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
1571; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1572; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
1573; GFX1064-NEXT:    v_readlane_b32 s8, v1, 47
1574; GFX1064-NEXT:    v_readlane_b32 s9, v1, 63
1575; GFX1064-NEXT:    v_writelane_b32 v3, s7, 32
1576; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
1577; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1578; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
1579; GFX1064-NEXT:    s_mov_b32 s4, s9
1580; GFX1064-NEXT:    v_writelane_b32 v3, s8, 48
1581; GFX1064-NEXT:    s_mov_b64 exec, s[6:7]
1582; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1583; GFX1064-NEXT:    s_mov_b32 s6, -1
1584; GFX1064-NEXT:    ; implicit-def: $vgpr0
1585; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
1586; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
1587; GFX1064-NEXT:  ; %bb.1:
1588; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
1589; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1590; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1591; GFX1064-NEXT:    s_mov_b32 s4, s2
1592; GFX1064-NEXT:    s_mov_b32 s5, s3
1593; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1594; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1595; GFX1064-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
1596; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1597; GFX1064-NEXT:    buffer_gl0_inv
1598; GFX1064-NEXT:    buffer_gl1_inv
1599; GFX1064-NEXT:  .LBB8_2:
1600; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1601; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
1602; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1603; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1604; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
1605; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1606; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1607; GFX1064-NEXT:    s_mov_b32 s2, s6
1608; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1609; GFX1064-NEXT:    s_endpgm
1610;
1611; GFX1032-LABEL: sub_i32_varying:
1612; GFX1032:       ; %bb.0: ; %entry
1613; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
1614; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1615; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1616; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
1617; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
1618; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1619; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1620; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1621; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1622; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
1623; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1624; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
1625; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1626; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
1627; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1628; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
1629; GFX1032-NEXT:    v_readlane_b32 s5, v1, 15
1630; GFX1032-NEXT:    v_readlane_b32 s6, v1, 31
1631; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1632; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
1633; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1634; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
1635; GFX1032-NEXT:    v_writelane_b32 v3, s5, 16
1636; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
1637; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1638; GFX1032-NEXT:    s_mov_b32 s4, s6
1639; GFX1032-NEXT:    s_mov_b32 s6, -1
1640; GFX1032-NEXT:    ; implicit-def: $vgpr0
1641; GFX1032-NEXT:    s_and_saveexec_b32 s8, vcc_lo
1642; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
1643; GFX1032-NEXT:  ; %bb.1:
1644; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
1645; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1646; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1647; GFX1032-NEXT:    s_mov_b32 s4, s2
1648; GFX1032-NEXT:    s_mov_b32 s5, s3
1649; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1650; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1651; GFX1032-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
1652; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1653; GFX1032-NEXT:    buffer_gl0_inv
1654; GFX1032-NEXT:    buffer_gl1_inv
1655; GFX1032-NEXT:  .LBB8_2:
1656; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1657; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
1658; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1659; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1660; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
1661; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1662; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1663; GFX1032-NEXT:    s_mov_b32 s2, s6
1664; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1665; GFX1032-NEXT:    s_endpgm
1666entry:
  ; %lane (the workitem id) is the atomicrmw operand; %old is the value the
  ; atomic returns, which is then written to %out.
1667  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1668  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
1669  store i32 %old, i32 addrspace(1)* %out
1670  ret void
1671}
1672
; sub_i64_constant: 64-bit atomicrmw sub (acq_rel) of the immediate 5 on a
; global (addrspace(1)) pointer; the value returned by the atomic is stored to
; %out.
; In the expected output below, every run counts the bits of the saved exec
; mask with s_bcnt1, multiplies that count by 5, and issues one
; buffer_atomic_sub_x2 inside an s_and_saveexec region entered when the mbcnt
; result is 0. After the region, each lane subtracts 5 times its mbcnt result
; (v_mul_u32_u24 / v_mul_hi_u32_u24) from the v_readfirstlane'd atomic result
; before the store.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1673define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
1674; GFX7LESS-LABEL: sub_i64_constant:
1675; GFX7LESS:       ; %bb.0: ; %entry
1676; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1677; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1678; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1679; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
1680; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1681; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1682; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1683; GFX7LESS-NEXT:    s_cbranch_execz .LBB9_2
1684; GFX7LESS-NEXT:  ; %bb.1:
1685; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
1686; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1687; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
1688; GFX7LESS-NEXT:    s_mov_b32 s10, -1
1689; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1690; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1691; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1692; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1693; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1694; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1695; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
1696; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1697; GFX7LESS-NEXT:    buffer_wbinvl1
1698; GFX7LESS-NEXT:  .LBB9_2:
1699; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1700; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1701; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1702; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1703; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
1704; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
1705; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
1706; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1707; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1708; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
1709; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1710; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1711; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1712; GFX7LESS-NEXT:    s_endpgm
1713;
1714; GFX8-LABEL: sub_i64_constant:
1715; GFX8:       ; %bb.0: ; %entry
1716; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1717; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1718; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1719; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1720; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1721; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1722; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1723; GFX8-NEXT:    s_cbranch_execz .LBB9_2
1724; GFX8-NEXT:  ; %bb.1:
1725; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1726; GFX8-NEXT:    s_mov_b32 s8, s2
1727; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1728; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1729; GFX8-NEXT:    s_mov_b32 s11, 0xf000
1730; GFX8-NEXT:    s_mov_b32 s10, -1
1731; GFX8-NEXT:    s_mov_b32 s9, s3
1732; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1733; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1734; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1735; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
1736; GFX8-NEXT:    s_waitcnt vmcnt(0)
1737; GFX8-NEXT:    buffer_wbinvl1_vol
1738; GFX8-NEXT:  .LBB9_2:
1739; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1740; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
1741; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
1742; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1743; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1744; GFX8-NEXT:    v_mov_b32_e32 v2, s5
1745; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
1746; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1747; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1748; GFX8-NEXT:    s_mov_b32 s2, -1
1749; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1750; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1751; GFX8-NEXT:    s_endpgm
1752;
1753; GFX9-LABEL: sub_i64_constant:
1754; GFX9:       ; %bb.0: ; %entry
1755; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1756; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1757; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1758; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1759; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1760; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1761; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1762; GFX9-NEXT:    s_cbranch_execz .LBB9_2
1763; GFX9-NEXT:  ; %bb.1:
1764; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1765; GFX9-NEXT:    s_mov_b32 s8, s2
1766; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1767; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1768; GFX9-NEXT:    s_mov_b32 s11, 0xf000
1769; GFX9-NEXT:    s_mov_b32 s10, -1
1770; GFX9-NEXT:    s_mov_b32 s9, s3
1771; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1772; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1773; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1774; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
1775; GFX9-NEXT:    s_waitcnt vmcnt(0)
1776; GFX9-NEXT:    buffer_wbinvl1_vol
1777; GFX9-NEXT:  .LBB9_2:
1778; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1779; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1780; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
1781; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1782; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1783; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1784; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
1785; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1786; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1787; GFX9-NEXT:    s_mov_b32 s2, -1
1788; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
1789; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1790; GFX9-NEXT:    s_endpgm
1791;
1792; GFX1064-LABEL: sub_i64_constant:
1793; GFX1064:       ; %bb.0: ; %entry
1794; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1795; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1796; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1797; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1798; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1799; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1800; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1801; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
1802; GFX1064-NEXT:  ; %bb.1:
1803; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1804; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
1805; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
1806; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
1807; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
1808; GFX1064-NEXT:    s_mov_b32 s10, -1
1809; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1810; GFX1064-NEXT:    s_mov_b32 s8, s2
1811; GFX1064-NEXT:    s_mov_b32 s9, s3
1812; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1813; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1814; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
1815; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1816; GFX1064-NEXT:    buffer_gl0_inv
1817; GFX1064-NEXT:    buffer_gl1_inv
1818; GFX1064-NEXT:  .LBB9_2:
1819; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1820; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1821; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1822; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1823; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1824; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1825; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1826; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
1827; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
1828; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1829; GFX1064-NEXT:    s_mov_b32 s2, -1
1830; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1831; GFX1064-NEXT:    s_endpgm
1832;
1833; GFX1032-LABEL: sub_i64_constant:
1834; GFX1032:       ; %bb.0: ; %entry
1835; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1836; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1837; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1838; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1839; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1840; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1841; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
1842; GFX1032-NEXT:  ; %bb.1:
1843; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1844; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1845; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
1846; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
1847; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1848; GFX1032-NEXT:    s_mov_b32 s10, -1
1849; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1850; GFX1032-NEXT:    s_mov_b32 s8, s2
1851; GFX1032-NEXT:    s_mov_b32 s9, s3
1852; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1853; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1854; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
1855; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1856; GFX1032-NEXT:    buffer_gl0_inv
1857; GFX1032-NEXT:    buffer_gl1_inv
1858; GFX1032-NEXT:  .LBB9_2:
1859; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1860; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1861; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1862; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1863; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
1864; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1865; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
1866; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
1867; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
1868; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1869; GFX1032-NEXT:    s_mov_b32 s2, -1
1870; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1871; GFX1032-NEXT:    s_endpgm
1872entry:
  ; %old is the value returned by the atomic subtract of the constant 5; it is
  ; then written to %out.
1873  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
1874  store i64 %old, i64 addrspace(1)* %out
1875  ret void
1876}
1877
; sub_i64_uniform: 64-bit atomicrmw sub (acq_rel) on a global (addrspace(1))
; pointer where the operand is the kernel argument %subitive; the value
; returned by the atomic is stored to %out.
; In the expected output below, every run multiplies the loaded argument by the
; s_bcnt1 count of the saved exec mask (scalar multiplies and/or v_mul_hi_u32 /
; v_mad_u64_u32, depending on the run) and issues one buffer_atomic_sub_x2
; inside an s_and_saveexec region entered when the mbcnt result is 0. After the
; region, each lane subtracts the argument multiplied by its mbcnt result from
; the v_readfirstlane'd atomic result before the store.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1878define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
1879; GFX7LESS-LABEL: sub_i64_uniform:
1880; GFX7LESS:       ; %bb.0: ; %entry
1881; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
1882; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1883; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1884; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
1885; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s9, v0
1886; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1887; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1888; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1889; GFX7LESS-NEXT:    s_cbranch_execz .LBB10_2
1890; GFX7LESS-NEXT:  ; %bb.1:
1891; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
1892; GFX7LESS-NEXT:    s_mov_b32 s14, -1
1893; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1894; GFX7LESS-NEXT:    s_mov_b32 s12, s6
1895; GFX7LESS-NEXT:    s_mov_b32 s13, s7
1896; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
1897; GFX7LESS-NEXT:    s_mul_i32 s7, s1, s6
1898; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1899; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s0, v0
1900; GFX7LESS-NEXT:    s_mul_i32 s6, s0, s6
1901; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
1902; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1903; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1904; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
1905; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1906; GFX7LESS-NEXT:    buffer_wbinvl1
1907; GFX7LESS-NEXT:  .LBB10_2:
1908; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1909; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1910; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1911; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1912; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
1913; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v1
1914; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
1915; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s1, v2
1916; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v2
1917; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s0, v2
1918; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1919; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s3
1920; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v2
1921; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
1922; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1923; GFX7LESS-NEXT:    s_endpgm
1924;
1925; GFX8-LABEL: sub_i64_uniform:
1926; GFX8:       ; %bb.0: ; %entry
1927; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1928; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1929; GFX8-NEXT:    s_mov_b64 s[8:9], exec
1930; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1931; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
1932; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1933; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1934; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1935; GFX8-NEXT:    s_cbranch_execz .LBB10_2
1936; GFX8-NEXT:  ; %bb.1:
1937; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1938; GFX8-NEXT:    s_mov_b32 s12, s6
1939; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
1940; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1941; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
1942; GFX8-NEXT:    s_mul_i32 s6, s1, s6
1943; GFX8-NEXT:    s_mov_b32 s15, 0xf000
1944; GFX8-NEXT:    s_mov_b32 s14, -1
1945; GFX8-NEXT:    s_mov_b32 s13, s7
1946; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
1947; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1948; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
1949; GFX8-NEXT:    s_waitcnt vmcnt(0)
1950; GFX8-NEXT:    buffer_wbinvl1_vol
1951; GFX8-NEXT:  .LBB10_2:
1952; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1953; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1954; GFX8-NEXT:    v_mul_lo_u32 v4, s1, v2
1955; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
1956; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1957; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
1958; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
1959; GFX8-NEXT:    v_mov_b32_e32 v3, s1
1960; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v2
1961; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1962; GFX8-NEXT:    s_mov_b32 s6, -1
1963; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
1964; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1965; GFX8-NEXT:    s_endpgm
1966;
1967; GFX9-LABEL: sub_i64_uniform:
1968; GFX9:       ; %bb.0: ; %entry
1969; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1970; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1971; GFX9-NEXT:    s_mov_b64 s[8:9], exec
1972; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1973; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
1974; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1975; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1976; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1977; GFX9-NEXT:    s_cbranch_execz .LBB10_2
1978; GFX9-NEXT:  ; %bb.1:
1979; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1980; GFX9-NEXT:    s_mov_b32 s12, s6
1981; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
1982; GFX9-NEXT:    s_mov_b32 s13, s7
1983; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1984; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1985; GFX9-NEXT:    s_add_i32 s8, s8, s7
1986; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1987; GFX9-NEXT:    s_mov_b32 s15, 0xf000
1988; GFX9-NEXT:    s_mov_b32 s14, -1
1989; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1990; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1991; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1992; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
1993; GFX9-NEXT:    s_waitcnt vmcnt(0)
1994; GFX9-NEXT:    buffer_wbinvl1_vol
1995; GFX9-NEXT:  .LBB10_2:
1996; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1997; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1998; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
1999; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2000; GFX9-NEXT:    s_mov_b32 s6, -1
2001; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
2002; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2003; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2004; GFX9-NEXT:    v_mov_b32_e32 v1, v4
2005; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2006; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
2007; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2008; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2009; GFX9-NEXT:    s_endpgm
2010;
2011; GFX1064-LABEL: sub_i64_uniform:
2012; GFX1064:       ; %bb.0: ; %entry
2013; GFX1064-NEXT:    s_clause 0x1
2014; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2015; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2016; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
2017; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
2018; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
2019; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2020; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2021; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2022; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
2023; GFX1064-NEXT:  ; %bb.1:
2024; GFX1064-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
2025; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
2026; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2027; GFX1064-NEXT:    s_mul_i32 s9, s3, s8
2028; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s8
2029; GFX1064-NEXT:    s_mul_i32 s8, s2, s8
2030; GFX1064-NEXT:    s_add_i32 s10, s10, s9
2031; GFX1064-NEXT:    v_mov_b32_e32 v0, s8
2032; GFX1064-NEXT:    v_mov_b32_e32 v1, s10
2033; GFX1064-NEXT:    s_mov_b32 s10, -1
2034; GFX1064-NEXT:    s_mov_b32 s8, s6
2035; GFX1064-NEXT:    s_mov_b32 s9, s7
2036; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2037; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2038; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2039; GFX1064-NEXT:    s_waitcnt vmcnt(0)
2040; GFX1064-NEXT:    buffer_gl0_inv
2041; GFX1064-NEXT:    buffer_gl1_inv
2042; GFX1064-NEXT:  .LBB10_2:
2043; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2044; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
2045; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2046; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
2047; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
2048; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
2049; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
2050; GFX1064-NEXT:    s_mov_b32 s6, -1
2051; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], null, s3, v2, v[4:5]
2052; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v3
2053; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
2054; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
2055; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2056; GFX1064-NEXT:    s_endpgm
2057;
2058; GFX1032-LABEL: sub_i64_uniform:
2059; GFX1032:       ; %bb.0: ; %entry
2060; GFX1032-NEXT:    s_clause 0x1
2061; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2062; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2063; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
2064; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2065; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s8, 0
2066; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2067; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2068; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
2069; GFX1032-NEXT:  ; %bb.1:
2070; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s8
2071; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
2072; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2073; GFX1032-NEXT:    s_mul_i32 s8, s3, s1
2074; GFX1032-NEXT:    s_mul_hi_u32 s9, s2, s1
2075; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
2076; GFX1032-NEXT:    s_add_i32 s9, s9, s8
2077; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
2078; GFX1032-NEXT:    v_mov_b32_e32 v1, s9
2079; GFX1032-NEXT:    s_mov_b32 s10, -1
2080; GFX1032-NEXT:    s_mov_b32 s8, s6
2081; GFX1032-NEXT:    s_mov_b32 s9, s7
2082; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2083; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2084; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2085; GFX1032-NEXT:    s_waitcnt vmcnt(0)
2086; GFX1032-NEXT:    buffer_gl0_inv
2087; GFX1032-NEXT:    buffer_gl1_inv
2088; GFX1032-NEXT:  .LBB10_2:
2089; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2090; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2091; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2092; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], null, s2, v2, 0
2093; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
2094; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
2095; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
2096; GFX1032-NEXT:    s_mov_b32 s6, -1
2097; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], null, s3, v2, v[4:5]
2098; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v3
2099; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
2100; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2101; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2102; GFX1032-NEXT:    s_endpgm
2103entry:
  ; %old is the value returned by the atomic subtract of the kernel argument
  ; %subitive; it is then written to %out.
2104  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
2105  store i64 %old, i64 addrspace(1)* %out
2106  ret void
2107}
2108
; sub_i64_varying: 64-bit atomic subtract on a global pointer where the value
; being subtracted comes from llvm.amdgcn.workitem.id.x (zero-extended to i64)
; instead of a constant or a kernel argument. The value returned by the atomic
; is stored to %out.
;
; For every run line the expected code below is a straight-line sequence around
; a single buffer_atomic_sub_x2: no branch and no exec-mask save/restore is
; expected for this kernel.
; NOTE(review): presumably this is because the atomic optimizer leaves 64-bit
; operands that are not uniform untouched -- confirm against the pass itself.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with the script rather than editing them by hand.
define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
; GFX7LESS-LABEL: sub_i64_varying:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s6, -1
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
; GFX7LESS-NEXT:    s_mov_b32 s10, s6
; GFX7LESS-NEXT:    s_mov_b32 s11, s7
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s8, s2
; GFX7LESS-NEXT:    s_mov_b32 s9, s3
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:    s_mov_b32 s4, s0
; GFX7LESS-NEXT:    s_mov_b32 s5, s1
; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX89-LABEL: sub_i64_varying:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX89-NEXT:    s_mov_b32 s7, 0xf000
; GFX89-NEXT:    s_mov_b32 s6, -1
; GFX89-NEXT:    s_mov_b32 s10, s6
; GFX89-NEXT:    s_mov_b32 s11, s7
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    v_mov_b32_e32 v1, 0
; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX89-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    buffer_wbinvl1_vol
; GFX89-NEXT:    s_mov_b32 s4, s0
; GFX89-NEXT:    s_mov_b32 s5, s1
; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX89-NEXT:    s_endpgm
;
; GFX10-LABEL: sub_i64_varying:
; GFX10:       ; %bb.0: ; %entry
; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX10-NEXT:    v_mov_b32_e32 v1, 0
; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
; GFX10-NEXT:    s_mov_b32 s6, -1
; GFX10-NEXT:    s_mov_b32 s11, s7
; GFX10-NEXT:    s_mov_b32 s10, s6
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_mov_b32 s8, s2
; GFX10-NEXT:    s_mov_b32 s9, s3
; GFX10-NEXT:    s_mov_b32 s4, s0
; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX10-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    buffer_gl0_inv
; GFX10-NEXT:    buffer_gl1_inv
; GFX10-NEXT:    s_mov_b32 s5, s1
; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX10-NEXT:    s_endpgm
entry:
  ; Operand of the atomic: the x workitem id, widened from i32 to i64.
  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  %zext = zext i32 %lane to i64
  ; atomicrmw yields the value that was in memory before the subtraction.
  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  ; Storing the result keeps the returned value live, so the checks above
  ; expect the returning (glc) form of the atomic.
  store i64 %old, i64 addrspace(1)* %out
  ret void
}
2178