1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
3; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
4; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
5; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN64 %s
6; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN32 %s
7
; AMDGPU intrinsic. In this file it is called by @add_i32_varying to obtain an
; atomic operand that is not the same for every lane (hence "varying").
8declare i32 @llvm.amdgcn.workitem.id.x()
9
10; Show what the atomic optimization pass will do for global pointers.
11
; add_i32_constant: atomically adds the constant 5 to the i32 at %inout (global
; address space 1, acq_rel ordering) and stores the value returned by the atomic
; to %out.
;
; What the autogenerated assertions below show, for every run configuration:
;   * The exec mask is copied to SGPRs and fed to v_mbcnt_*, and only lanes for
;     which that result compares equal to 0 enter the block that contains
;     buffer_atomic_add (v_cmp_eq_u32 + s_and_saveexec + s_cbranch_execz).
;   * Inside that block s_bcnt1 counts the bits of the saved exec mask and
;     s_mul_i32 multiplies the count by 5, so a single atomic carries the
;     combined addend.
;   * After exec is restored, v_readfirstlane_b32 reads the atomic's result into
;     an SGPR and v_mad_u32_u24 computes (mbcnt result * 5) + that SGPR, which is
;     the value written by buffer_store_dword.
; The tonga and gfx900 run lines share one set of assertions for this function.
; Do not hand-edit the assertion lines; regenerate them with the script named on
; the first line of the file.
12define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
13; GFX7LESS-LABEL: add_i32_constant:
14; GFX7LESS:       ; %bb.0: ; %entry
15; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
16; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
17; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
18; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
19; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
20; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
21; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
22; GFX7LESS-NEXT:    s_cbranch_execz BB0_2
23; GFX7LESS-NEXT:  ; %bb.1:
24; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
25; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
26; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
27; GFX7LESS-NEXT:    s_mov_b32 s10, -1
28; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
29; GFX7LESS-NEXT:    s_mov_b32 s8, s2
30; GFX7LESS-NEXT:    s_mov_b32 s9, s3
31; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
32; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
33; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
34; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
35; GFX7LESS-NEXT:    buffer_wbinvl1
36; GFX7LESS-NEXT:  BB0_2:
37; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
38; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
39; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
40; GFX7LESS-NEXT:    s_mov_b32 s2, -1
41; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
42; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
43; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
44; GFX7LESS-NEXT:    s_endpgm
45;
46; GFX89-LABEL: add_i32_constant:
47; GFX89:       ; %bb.0: ; %entry
48; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
49; GFX89-NEXT:    s_mov_b64 s[6:7], exec
50; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
51; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
52; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
53; GFX89-NEXT:    ; implicit-def: $vgpr1
54; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
55; GFX89-NEXT:    s_cbranch_execz BB0_2
56; GFX89-NEXT:  ; %bb.1:
57; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX89-NEXT:    s_mov_b32 s8, s2
59; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
60; GFX89-NEXT:    s_mul_i32 s2, s2, 5
61; GFX89-NEXT:    s_mov_b32 s11, 0xf000
62; GFX89-NEXT:    s_mov_b32 s10, -1
63; GFX89-NEXT:    s_mov_b32 s9, s3
64; GFX89-NEXT:    v_mov_b32_e32 v1, s2
65; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
66; GFX89-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
67; GFX89-NEXT:    s_waitcnt vmcnt(0)
68; GFX89-NEXT:    buffer_wbinvl1_vol
69; GFX89-NEXT:  BB0_2:
70; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
71; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
72; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
73; GFX89-NEXT:    s_mov_b32 s3, 0xf000
74; GFX89-NEXT:    s_mov_b32 s2, -1
75; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
76; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
77; GFX89-NEXT:    s_endpgm
78;
79; GCN64-LABEL: add_i32_constant:
80; GCN64:       ; %bb.0: ; %entry
81; GCN64-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
82; GCN64-NEXT:    s_mov_b64 s[6:7], exec
83; GCN64-NEXT:    ; implicit-def: $vgpr1
84; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
85; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
86; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
87; GCN64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
88; GCN64-NEXT:    s_cbranch_execz BB0_2
89; GCN64-NEXT:  ; %bb.1:
90; GCN64-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
91; GCN64-NEXT:    s_mov_b32 s11, 0x31016000
92; GCN64-NEXT:    s_mul_i32 s6, s6, 5
93; GCN64-NEXT:    s_mov_b32 s10, -1
94; GCN64-NEXT:    v_mov_b32_e32 v1, s6
95; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
96; GCN64-NEXT:    s_mov_b32 s8, s2
97; GCN64-NEXT:    s_mov_b32 s9, s3
98; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
99; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
100; GCN64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
101; GCN64-NEXT:    s_waitcnt vmcnt(0)
102; GCN64-NEXT:    buffer_gl0_inv
103; GCN64-NEXT:    buffer_gl1_inv
104; GCN64-NEXT:  BB0_2:
105; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
106; GCN64-NEXT:    s_or_b64 exec, exec, s[4:5]
107; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
108; GCN64-NEXT:    v_readfirstlane_b32 s2, v1
109; GCN64-NEXT:    s_mov_b32 s3, 0x31016000
110; GCN64-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
111; GCN64-NEXT:    s_mov_b32 s2, -1
112; GCN64-NEXT:    buffer_store_dword v0, off, s[0:3], 0
113; GCN64-NEXT:    s_endpgm
114;
115; GCN32-LABEL: add_i32_constant:
116; GCN32:       ; %bb.0: ; %entry
117; GCN32-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
118; GCN32-NEXT:    s_mov_b32 s5, exec_lo
119; GCN32-NEXT:    ; implicit-def: $vgpr1
120; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
121; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
122; GCN32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
123; GCN32-NEXT:    s_cbranch_execz BB0_2
124; GCN32-NEXT:  ; %bb.1:
125; GCN32-NEXT:    s_bcnt1_i32_b32 s5, s5
126; GCN32-NEXT:    s_mov_b32 s11, 0x31016000
127; GCN32-NEXT:    s_mul_i32 s5, s5, 5
128; GCN32-NEXT:    s_mov_b32 s10, -1
129; GCN32-NEXT:    v_mov_b32_e32 v1, s5
130; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
131; GCN32-NEXT:    s_mov_b32 s8, s2
132; GCN32-NEXT:    s_mov_b32 s9, s3
133; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
134; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
135; GCN32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
136; GCN32-NEXT:    s_waitcnt vmcnt(0)
137; GCN32-NEXT:    buffer_gl0_inv
138; GCN32-NEXT:    buffer_gl1_inv
139; GCN32-NEXT:  BB0_2:
140; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
141; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
142; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
143; GCN32-NEXT:    v_readfirstlane_b32 s2, v1
144; GCN32-NEXT:    s_mov_b32 s3, 0x31016000
145; GCN32-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
146; GCN32-NEXT:    s_mov_b32 s2, -1
147; GCN32-NEXT:    buffer_store_dword v0, off, s[0:3], 0
148; GCN32-NEXT:    s_endpgm
149entry:
  ; atomicrmw yields the value that was in memory before the add; the addend is
  ; the literal 5, i.e. identical for every lane.
150  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
  ; The store consumes %old, so the atomic's result is used and the generated
  ; code has to reconstruct a per-lane return value after the single atomic.
151  store i32 %old, i32 addrspace(1)* %out
152  ret void
153}
154
; add_i32_uniform: same shape as @add_i32_constant, but the addend is the third
; kernel argument %additive instead of a literal. The atomic is an acq_rel add
; on the i32 at %inout (address space 1); its result is stored to %out.
;
; What the autogenerated assertions below show, for every run configuration:
;   * %additive is fetched with a scalar load (s_load_dword), so it lives in an
;     SGPR.
;   * As in the constant case, only lanes whose v_mbcnt_* result is 0 execute
;     the block with buffer_atomic_add; there s_bcnt1 of the saved exec mask is
;     multiplied by the loaded argument with s_mul_i32.
;   * After exec is restored, each lane multiplies the argument by its mbcnt
;     result (v_mul_lo_u32) and adds the v_readfirstlane_b32 of the atomic's
;     result before the store.
; Unlike @add_i32_constant, tonga and gfx900 have separate assertion blocks
; here; the visible differences are register assignment and the final add
; (tonga's v_add_u32_e32 carries an explicit vcc operand, gfx900's does not).
; Do not hand-edit the assertion lines; regenerate them with the script named on
; the first line of the file.
155define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
156; GFX7LESS-LABEL: add_i32_uniform:
157; GFX7LESS:       ; %bb.0: ; %entry
158; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
159; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
160; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xd
161; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
162; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
163; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
164; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
165; GFX7LESS-NEXT:    s_and_saveexec_b64 s[8:9], vcc
166; GFX7LESS-NEXT:    s_cbranch_execz BB1_2
167; GFX7LESS-NEXT:  ; %bb.1:
168; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
169; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
170; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
171; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
172; GFX7LESS-NEXT:    s_mov_b32 s14, -1
173; GFX7LESS-NEXT:    s_mov_b32 s12, s6
174; GFX7LESS-NEXT:    s_mov_b32 s13, s7
175; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s1
176; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
177; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
178; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
179; GFX7LESS-NEXT:    buffer_wbinvl1
180; GFX7LESS-NEXT:  BB1_2:
181; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
182; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
184; GFX7LESS-NEXT:    s_mov_b32 s6, -1
185; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
186; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
187; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s1, v0
188; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
189; GFX7LESS-NEXT:    s_endpgm
190;
191; GFX8-LABEL: add_i32_uniform:
192; GFX8:       ; %bb.0: ; %entry
193; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
194; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
195; GFX8-NEXT:    s_mov_b64 s[2:3], exec
196; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
197; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
198; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
199; GFX8-NEXT:    ; implicit-def: $vgpr1
200; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], vcc
201; GFX8-NEXT:    s_cbranch_execz BB1_2
202; GFX8-NEXT:  ; %bb.1:
203; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
204; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX8-NEXT:    s_mul_i32 s1, s0, s1
206; GFX8-NEXT:    s_mov_b32 s15, 0xf000
207; GFX8-NEXT:    s_mov_b32 s14, -1
208; GFX8-NEXT:    s_mov_b32 s12, s6
209; GFX8-NEXT:    s_mov_b32 s13, s7
210; GFX8-NEXT:    v_mov_b32_e32 v1, s1
211; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
212; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
213; GFX8-NEXT:    s_waitcnt vmcnt(0)
214; GFX8-NEXT:    buffer_wbinvl1_vol
215; GFX8-NEXT:  BB1_2:
216; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
217; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
219; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
220; GFX8-NEXT:    s_mov_b32 s7, 0xf000
221; GFX8-NEXT:    s_mov_b32 s6, -1
222; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
223; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
224; GFX8-NEXT:    s_endpgm
225;
226; GFX9-LABEL: add_i32_uniform:
227; GFX9:       ; %bb.0: ; %entry
228; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
229; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
230; GFX9-NEXT:    s_mov_b64 s[8:9], exec
231; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
232; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
233; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
234; GFX9-NEXT:    ; implicit-def: $vgpr1
235; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
236; GFX9-NEXT:    s_cbranch_execz BB1_2
237; GFX9-NEXT:  ; %bb.1:
238; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
239; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
240; GFX9-NEXT:    s_mul_i32 s3, s2, s3
241; GFX9-NEXT:    s_mov_b32 s15, 0xf000
242; GFX9-NEXT:    s_mov_b32 s14, -1
243; GFX9-NEXT:    s_mov_b32 s12, s6
244; GFX9-NEXT:    s_mov_b32 s13, s7
245; GFX9-NEXT:    v_mov_b32_e32 v1, s3
246; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
247; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
248; GFX9-NEXT:    s_waitcnt vmcnt(0)
249; GFX9-NEXT:    buffer_wbinvl1_vol
250; GFX9-NEXT:  BB1_2:
251; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
252; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
254; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
255; GFX9-NEXT:    s_mov_b32 s7, 0xf000
256; GFX9-NEXT:    s_mov_b32 s6, -1
257; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
258; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
259; GFX9-NEXT:    s_endpgm
260;
261; GCN64-LABEL: add_i32_uniform:
262; GCN64:       ; %bb.0: ; %entry
263; GCN64-NEXT:    s_clause 0x1
264; GCN64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
265; GCN64-NEXT:    s_load_dword s2, s[0:1], 0x34
266; GCN64-NEXT:    s_mov_b64 s[8:9], exec
267; GCN64-NEXT:    ; implicit-def: $vgpr1
268; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
269; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s9, v0
270; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
271; GCN64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
272; GCN64-NEXT:    s_cbranch_execz BB1_2
273; GCN64-NEXT:  ; %bb.1:
274; GCN64-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
275; GCN64-NEXT:    s_mov_b32 s11, 0x31016000
276; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
277; GCN64-NEXT:    s_mul_i32 s3, s2, s3
278; GCN64-NEXT:    s_mov_b32 s10, -1
279; GCN64-NEXT:    v_mov_b32_e32 v1, s3
280; GCN64-NEXT:    s_mov_b32 s8, s6
281; GCN64-NEXT:    s_mov_b32 s9, s7
282; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
283; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
284; GCN64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
285; GCN64-NEXT:    s_waitcnt vmcnt(0)
286; GCN64-NEXT:    buffer_gl0_inv
287; GCN64-NEXT:    buffer_gl1_inv
288; GCN64-NEXT:  BB1_2:
289; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
290; GCN64-NEXT:    s_or_b64 exec, exec, s[0:1]
291; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
292; GCN64-NEXT:    v_mul_lo_u32 v0, s2, v0
293; GCN64-NEXT:    v_readfirstlane_b32 s0, v1
294; GCN64-NEXT:    s_mov_b32 s7, 0x31016000
295; GCN64-NEXT:    s_mov_b32 s6, -1
296; GCN64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
297; GCN64-NEXT:    buffer_store_dword v0, off, s[4:7], 0
298; GCN64-NEXT:    s_endpgm
299;
300; GCN32-LABEL: add_i32_uniform:
301; GCN32:       ; %bb.0: ; %entry
302; GCN32-NEXT:    s_clause 0x1
303; GCN32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
304; GCN32-NEXT:    s_load_dword s2, s[0:1], 0x34
305; GCN32-NEXT:    s_mov_b32 s3, exec_lo
306; GCN32-NEXT:    ; implicit-def: $vgpr1
307; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
308; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
309; GCN32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
310; GCN32-NEXT:    s_cbranch_execz BB1_2
311; GCN32-NEXT:  ; %bb.1:
312; GCN32-NEXT:    s_bcnt1_i32_b32 s1, s3
313; GCN32-NEXT:    s_mov_b32 s11, 0x31016000
314; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
315; GCN32-NEXT:    s_mul_i32 s1, s2, s1
316; GCN32-NEXT:    s_mov_b32 s10, -1
317; GCN32-NEXT:    v_mov_b32_e32 v1, s1
318; GCN32-NEXT:    s_mov_b32 s8, s6
319; GCN32-NEXT:    s_mov_b32 s9, s7
320; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
321; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
322; GCN32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
323; GCN32-NEXT:    s_waitcnt vmcnt(0)
324; GCN32-NEXT:    buffer_gl0_inv
325; GCN32-NEXT:    buffer_gl1_inv
326; GCN32-NEXT:  BB1_2:
327; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
328; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
329; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
330; GCN32-NEXT:    v_mul_lo_u32 v0, s2, v0
331; GCN32-NEXT:    v_readfirstlane_b32 s0, v1
332; GCN32-NEXT:    s_mov_b32 s7, 0x31016000
333; GCN32-NEXT:    s_mov_b32 s6, -1
334; GCN32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
335; GCN32-NEXT:    buffer_store_dword v0, off, s[4:7], 0
336; GCN32-NEXT:    s_endpgm
337entry:
  ; The addend is the kernel argument %additive: unknown at compile time, but a
  ; single value shared by all lanes (it is loaded into an SGPR in the output).
338  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
  ; The store keeps the atomic's result live so the per-lane reconstruction code
  ; appears in the output.
339  store i32 %old, i32 addrspace(1)* %out
340  ret void
341}
342
; add_i32_varying: the addend is the result of @llvm.amdgcn.workitem.id.x, so
; it is not the same for every lane. The atomic is an acq_rel add on the i32 at
; %inout (address space 1); its result is stored to %out.
;
; What the autogenerated assertions below show, per run configuration:
;   * Run line without -mcpu: no v_mbcnt_*, no exec-mask branch; v0 is passed
;     straight to buffer_atomic_add and the returned v0 is stored. In other
;     words the atomic is left as a plain per-lane operation.
;   * tonga and gfx900: 0 is written while exec is inverted (s_not_b64 pair),
;     then a chain of v_add_u32_dpp steps (row_shr:1/2/4/8, row_bcast:15,
;     row_bcast:31) combines the values across lanes. v_readlane_b32 of lane 63
;     supplies the operand of the single buffer_atomic_add, and a
;     v_mov_b32_dpp with wave_shr:1 produces the per-lane value that is added
;     to the v_readfirstlane_b32 of the atomic's result before the store.
;   * gfx1010 (both wavefront sizes): the row_shr steps are followed by
;     v_permlanex16_b32 and v_readlane_b32 / v_writelane_b32 fix-ups instead of
;     row_bcast / wave_shr. The operand of the atomic comes from lane 63 in the
;     wavefrontsize64 run and from lane 31 in the wavefrontsize32 run.
; Do not hand-edit the assertion lines; regenerate them with the script named on
; the first line of the file.
343define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
344; GFX7LESS-LABEL: add_i32_varying:
345; GFX7LESS:       ; %bb.0: ; %entry
346; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
347; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
348; GFX7LESS-NEXT:    s_mov_b32 s6, -1
349; GFX7LESS-NEXT:    s_mov_b32 s10, s6
350; GFX7LESS-NEXT:    s_mov_b32 s11, s7
351; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
352; GFX7LESS-NEXT:    s_mov_b32 s8, s2
353; GFX7LESS-NEXT:    s_mov_b32 s9, s3
354; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
355; GFX7LESS-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
356; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
357; GFX7LESS-NEXT:    buffer_wbinvl1
358; GFX7LESS-NEXT:    s_mov_b32 s4, s0
359; GFX7LESS-NEXT:    s_mov_b32 s5, s1
360; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
361; GFX7LESS-NEXT:    s_endpgm
362;
363; GFX8-LABEL: add_i32_varying:
364; GFX8:       ; %bb.0: ; %entry
365; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
366; GFX8-NEXT:    v_mov_b32_e32 v2, v0
367; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
368; GFX8-NEXT:    v_mov_b32_e32 v1, 0
369; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
370; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
371; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
372; GFX8-NEXT:    s_not_b64 exec, exec
373; GFX8-NEXT:    v_mov_b32_e32 v2, 0
374; GFX8-NEXT:    s_not_b64 exec, exec
375; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
376; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
377; GFX8-NEXT:    s_nop 1
378; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
379; GFX8-NEXT:    s_nop 1
380; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
381; GFX8-NEXT:    s_nop 1
382; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
383; GFX8-NEXT:    s_nop 1
384; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
385; GFX8-NEXT:    s_nop 1
386; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
387; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
388; GFX8-NEXT:    s_nop 0
389; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
390; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
391; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
392; GFX8-NEXT:    ; implicit-def: $vgpr0
393; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
394; GFX8-NEXT:    s_cbranch_execz BB2_2
395; GFX8-NEXT:  ; %bb.1:
396; GFX8-NEXT:    s_mov_b32 s11, 0xf000
397; GFX8-NEXT:    s_mov_b32 s10, -1
398; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX8-NEXT:    s_mov_b32 s8, s2
400; GFX8-NEXT:    s_mov_b32 s9, s3
401; GFX8-NEXT:    v_mov_b32_e32 v0, s6
402; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
403; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
404; GFX8-NEXT:    s_waitcnt vmcnt(0)
405; GFX8-NEXT:    buffer_wbinvl1_vol
406; GFX8-NEXT:  BB2_2:
407; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
408; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
409; GFX8-NEXT:    v_mov_b32_e32 v0, v1
410; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
411; GFX8-NEXT:    s_mov_b32 s3, 0xf000
412; GFX8-NEXT:    s_mov_b32 s2, -1
413; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
414; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
415; GFX8-NEXT:    s_endpgm
416;
417; GFX9-LABEL: add_i32_varying:
418; GFX9:       ; %bb.0: ; %entry
419; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
420; GFX9-NEXT:    v_mov_b32_e32 v2, v0
421; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
422; GFX9-NEXT:    v_mov_b32_e32 v1, 0
423; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
424; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
425; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
426; GFX9-NEXT:    s_not_b64 exec, exec
427; GFX9-NEXT:    v_mov_b32_e32 v2, 0
428; GFX9-NEXT:    s_not_b64 exec, exec
429; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
430; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
431; GFX9-NEXT:    s_nop 1
432; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
433; GFX9-NEXT:    s_nop 1
434; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
435; GFX9-NEXT:    s_nop 1
436; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
437; GFX9-NEXT:    s_nop 1
438; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
439; GFX9-NEXT:    s_nop 1
440; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
441; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
442; GFX9-NEXT:    s_nop 0
443; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
444; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
445; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
446; GFX9-NEXT:    ; implicit-def: $vgpr0
447; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
448; GFX9-NEXT:    s_cbranch_execz BB2_2
449; GFX9-NEXT:  ; %bb.1:
450; GFX9-NEXT:    s_mov_b32 s11, 0xf000
451; GFX9-NEXT:    s_mov_b32 s10, -1
452; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
453; GFX9-NEXT:    s_mov_b32 s8, s2
454; GFX9-NEXT:    s_mov_b32 s9, s3
455; GFX9-NEXT:    v_mov_b32_e32 v0, s6
456; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
457; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
458; GFX9-NEXT:    s_waitcnt vmcnt(0)
459; GFX9-NEXT:    buffer_wbinvl1_vol
460; GFX9-NEXT:  BB2_2:
461; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
462; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
463; GFX9-NEXT:    v_mov_b32_e32 v0, v1
464; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
465; GFX9-NEXT:    s_mov_b32 s3, 0xf000
466; GFX9-NEXT:    s_mov_b32 s2, -1
467; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
468; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
469; GFX9-NEXT:    s_endpgm
470;
471; GCN64-LABEL: add_i32_varying:
472; GCN64:       ; %bb.0: ; %entry
473; GCN64-NEXT:    v_mov_b32_e32 v1, v0
474; GCN64-NEXT:    s_not_b64 exec, exec
475; GCN64-NEXT:    v_mov_b32_e32 v1, 0
476; GCN64-NEXT:    s_not_b64 exec, exec
477; GCN64-NEXT:    s_or_saveexec_b64 s[2:3], -1
478; GCN64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
479; GCN64-NEXT:    v_mov_b32_e32 v3, 0
480; GCN64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
481; GCN64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
482; GCN64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
483; GCN64-NEXT:    v_mov_b32_e32 v2, v1
484; GCN64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
485; GCN64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
486; GCN64-NEXT:    v_readlane_b32 s4, v1, 31
487; GCN64-NEXT:    v_mov_b32_e32 v2, s4
488; GCN64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
489; GCN64-NEXT:    v_readlane_b32 s6, v1, 15
490; GCN64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
491; GCN64-NEXT:    s_mov_b64 exec, s[2:3]
492; GCN64-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
493; GCN64-NEXT:    s_or_saveexec_b64 s[4:5], -1
494; GCN64-NEXT:    v_readlane_b32 s7, v1, 31
495; GCN64-NEXT:    v_writelane_b32 v3, s6, 16
496; GCN64-NEXT:    s_mov_b64 exec, s[4:5]
497; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
498; GCN64-NEXT:    s_or_saveexec_b64 s[4:5], -1
499; GCN64-NEXT:    v_readlane_b32 s8, v1, 47
500; GCN64-NEXT:    v_readlane_b32 s9, v1, 63
501; GCN64-NEXT:    v_writelane_b32 v3, s7, 32
502; GCN64-NEXT:    s_mov_b64 exec, s[4:5]
503; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
504; GCN64-NEXT:    s_or_saveexec_b64 s[6:7], -1
505; GCN64-NEXT:    s_mov_b32 s4, s9
506; GCN64-NEXT:    v_writelane_b32 v3, s8, 48
507; GCN64-NEXT:    s_mov_b64 exec, s[6:7]
508; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
509; GCN64-NEXT:    s_mov_b32 s6, -1
510; GCN64-NEXT:    ; implicit-def: $vgpr0
511; GCN64-NEXT:    s_and_saveexec_b64 s[8:9], vcc
512; GCN64-NEXT:    s_cbranch_execz BB2_2
513; GCN64-NEXT:  ; %bb.1:
514; GCN64-NEXT:    v_mov_b32_e32 v0, s4
515; GCN64-NEXT:    s_mov_b32 s7, 0x31016000
516; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
517; GCN64-NEXT:    s_mov_b32 s4, s2
518; GCN64-NEXT:    s_mov_b32 s5, s3
519; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
520; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
521; GCN64-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
522; GCN64-NEXT:    s_waitcnt vmcnt(0)
523; GCN64-NEXT:    buffer_gl0_inv
524; GCN64-NEXT:    buffer_gl1_inv
525; GCN64-NEXT:  BB2_2:
526; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
527; GCN64-NEXT:    s_or_b64 exec, exec, s[8:9]
528; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
529; GCN64-NEXT:    v_readfirstlane_b32 s2, v0
530; GCN64-NEXT:    v_mov_b32_e32 v0, v3
531; GCN64-NEXT:    s_mov_b32 s3, 0x31016000
532; GCN64-NEXT:    v_add_nc_u32_e32 v0, s2, v0
533; GCN64-NEXT:    s_mov_b32 s2, s6
534; GCN64-NEXT:    buffer_store_dword v0, off, s[0:3], 0
535; GCN64-NEXT:    s_endpgm
536;
537; GCN32-LABEL: add_i32_varying:
538; GCN32:       ; %bb.0: ; %entry
539; GCN32-NEXT:    v_mov_b32_e32 v1, v0
540; GCN32-NEXT:    s_not_b32 exec_lo, exec_lo
541; GCN32-NEXT:    v_mov_b32_e32 v1, 0
542; GCN32-NEXT:    s_not_b32 exec_lo, exec_lo
543; GCN32-NEXT:    s_or_saveexec_b32 s2, -1
544; GCN32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
545; GCN32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
546; GCN32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
547; GCN32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
548; GCN32-NEXT:    v_mov_b32_e32 v2, v1
549; GCN32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
550; GCN32-NEXT:    s_mov_b32 exec_lo, s2
551; GCN32-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
552; GCN32-NEXT:    s_or_saveexec_b32 s4, -1
553; GCN32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
554; GCN32-NEXT:    v_mov_b32_e32 v3, 0
555; GCN32-NEXT:    v_readlane_b32 s5, v1, 15
556; GCN32-NEXT:    v_readlane_b32 s6, v1, 31
557; GCN32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
558; GCN32-NEXT:    s_mov_b32 exec_lo, s4
559; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
560; GCN32-NEXT:    s_or_saveexec_b32 s4, -1
561; GCN32-NEXT:    v_writelane_b32 v3, s5, 16
562; GCN32-NEXT:    s_mov_b32 exec_lo, s4
563; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
564; GCN32-NEXT:    s_mov_b32 s4, s6
565; GCN32-NEXT:    s_mov_b32 s6, -1
566; GCN32-NEXT:    ; implicit-def: $vgpr0
567; GCN32-NEXT:    s_and_saveexec_b32 s8, vcc_lo
568; GCN32-NEXT:    s_cbranch_execz BB2_2
569; GCN32-NEXT:  ; %bb.1:
570; GCN32-NEXT:    v_mov_b32_e32 v0, s4
571; GCN32-NEXT:    s_mov_b32 s7, 0x31016000
572; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
573; GCN32-NEXT:    s_mov_b32 s4, s2
574; GCN32-NEXT:    s_mov_b32 s5, s3
575; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
576; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
577; GCN32-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
578; GCN32-NEXT:    s_waitcnt vmcnt(0)
579; GCN32-NEXT:    buffer_gl0_inv
580; GCN32-NEXT:    buffer_gl1_inv
581; GCN32-NEXT:  BB2_2:
582; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
583; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s8
584; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
585; GCN32-NEXT:    v_readfirstlane_b32 s2, v0
586; GCN32-NEXT:    v_mov_b32_e32 v0, v3
587; GCN32-NEXT:    s_mov_b32 s3, 0x31016000
588; GCN32-NEXT:    v_add_nc_u32_e32 v0, s2, v0
589; GCN32-NEXT:    s_mov_b32 s2, s6
590; GCN32-NEXT:    buffer_store_dword v0, off, s[0:3], 0
591; GCN32-NEXT:    s_endpgm
592entry:
  ; %lane is the per-lane addend; it arrives in v0 in every assertion block above.
593  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; atomicrmw yields the value that was in memory before this lane's add.
594  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
  ; The store keeps the atomic's result live so the per-lane reconstruction code
  ; appears in the output.
595  store i32 %old, i32 addrspace(1)* %out
596  ret void
597}
598
599define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
600; GFX7LESS-LABEL: add_i64_constant:
601; GFX7LESS:       ; %bb.0: ; %entry
602; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
603; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
604; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
605; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
606; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
607; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
608; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
609; GFX7LESS-NEXT:    s_cbranch_execz BB3_2
610; GFX7LESS-NEXT:  ; %bb.1:
611; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
612; GFX7LESS-NEXT:    s_mov_b32 s10, -1
613; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
614; GFX7LESS-NEXT:    s_mov_b32 s8, s2
615; GFX7LESS-NEXT:    s_mov_b32 s9, s3
616; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
617; GFX7LESS-NEXT:    s_mul_i32 s3, s2, 5
618; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s2, 5
619; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s3
620; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
621; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
622; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
623; GFX7LESS-NEXT:    buffer_wbinvl1
624; GFX7LESS-NEXT:  BB3_2:
625; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
626; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
627; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
628; GFX7LESS-NEXT:    s_mov_b32 s2, -1
629; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
630; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v2
631; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
632; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
633; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
634; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
635; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
636; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
637; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
638; GFX7LESS-NEXT:    s_endpgm
639;
640; GFX89-LABEL: add_i64_constant:
641; GFX89:       ; %bb.0: ; %entry
642; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
643; GFX89-NEXT:    s_mov_b64 s[6:7], exec
644; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
645; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
646; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
647; GFX89-NEXT:    ; implicit-def: $vgpr1_vgpr2
648; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
649; GFX89-NEXT:    s_cbranch_execz BB3_2
650; GFX89-NEXT:  ; %bb.1:
651; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
652; GFX89-NEXT:    s_mov_b32 s8, s2
653; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
654; GFX89-NEXT:    v_mul_hi_u32_u24_e64 v2, s2, 5
655; GFX89-NEXT:    s_mul_i32 s2, s2, 5
656; GFX89-NEXT:    s_mov_b32 s11, 0xf000
657; GFX89-NEXT:    s_mov_b32 s10, -1
658; GFX89-NEXT:    s_mov_b32 s9, s3
659; GFX89-NEXT:    v_mov_b32_e32 v1, s2
660; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
661; GFX89-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
662; GFX89-NEXT:    s_waitcnt vmcnt(0)
663; GFX89-NEXT:    buffer_wbinvl1_vol
664; GFX89-NEXT:  BB3_2:
665; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
666; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
667; GFX89-NEXT:    v_readfirstlane_b32 s2, v1
668; GFX89-NEXT:    v_readfirstlane_b32 s3, v2
669; GFX89-NEXT:    v_mov_b32_e32 v1, s2
670; GFX89-NEXT:    v_mov_b32_e32 v2, s3
671; GFX89-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2]
672; GFX89-NEXT:    s_mov_b32 s3, 0xf000
673; GFX89-NEXT:    s_mov_b32 s2, -1
674; GFX89-NEXT:    s_nop 2
675; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
676; GFX89-NEXT:    s_endpgm
677;
678; GCN64-LABEL: add_i64_constant:
679; GCN64:       ; %bb.0: ; %entry
680; GCN64-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
681; GCN64-NEXT:    s_mov_b64 s[6:7], exec
682; GCN64-NEXT:    ; implicit-def: $vgpr1_vgpr2
683; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
684; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
685; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
686; GCN64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
687; GCN64-NEXT:    s_cbranch_execz BB3_2
688; GCN64-NEXT:  ; %bb.1:
689; GCN64-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
690; GCN64-NEXT:    s_mov_b32 s11, 0x31016000
691; GCN64-NEXT:    s_mul_i32 s7, s6, 5
692; GCN64-NEXT:    v_mul_hi_u32_u24_e64 v2, s6, 5
693; GCN64-NEXT:    v_mov_b32_e32 v1, s7
694; GCN64-NEXT:    s_mov_b32 s10, -1
695; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
696; GCN64-NEXT:    s_mov_b32 s8, s2
697; GCN64-NEXT:    s_mov_b32 s9, s3
698; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
699; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
700; GCN64-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
701; GCN64-NEXT:    s_waitcnt vmcnt(0)
702; GCN64-NEXT:    buffer_gl0_inv
703; GCN64-NEXT:    buffer_gl1_inv
704; GCN64-NEXT:  BB3_2:
705; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
706; GCN64-NEXT:    s_or_b64 exec, exec, s[4:5]
707; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
708; GCN64-NEXT:    v_readfirstlane_b32 s2, v1
709; GCN64-NEXT:    v_readfirstlane_b32 s3, v2
710; GCN64-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3]
711; GCN64-NEXT:    s_mov_b32 s3, 0x31016000
712; GCN64-NEXT:    s_mov_b32 s2, -1
713; GCN64-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
714; GCN64-NEXT:    s_endpgm
715;
716; GCN32-LABEL: add_i64_constant:
717; GCN32:       ; %bb.0: ; %entry
718; GCN32-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
719; GCN32-NEXT:    s_mov_b32 s5, exec_lo
720; GCN32-NEXT:    ; implicit-def: $vgpr1_vgpr2
721; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
722; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
723; GCN32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
724; GCN32-NEXT:    s_cbranch_execz BB3_2
725; GCN32-NEXT:  ; %bb.1:
726; GCN32-NEXT:    s_bcnt1_i32_b32 s5, s5
727; GCN32-NEXT:    s_mov_b32 s11, 0x31016000
728; GCN32-NEXT:    s_mul_i32 s6, s5, 5
729; GCN32-NEXT:    v_mul_hi_u32_u24_e64 v2, s5, 5
730; GCN32-NEXT:    v_mov_b32_e32 v1, s6
731; GCN32-NEXT:    s_mov_b32 s10, -1
732; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
733; GCN32-NEXT:    s_mov_b32 s8, s2
734; GCN32-NEXT:    s_mov_b32 s9, s3
735; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
736; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
737; GCN32-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
738; GCN32-NEXT:    s_waitcnt vmcnt(0)
739; GCN32-NEXT:    buffer_gl0_inv
740; GCN32-NEXT:    buffer_gl1_inv
741; GCN32-NEXT:  BB3_2:
742; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
743; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
744; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
745; GCN32-NEXT:    v_readfirstlane_b32 s2, v1
746; GCN32-NEXT:    v_readfirstlane_b32 s3, v2
747; GCN32-NEXT:    v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3]
748; GCN32-NEXT:    s_mov_b32 s3, 0x31016000
749; GCN32-NEXT:    s_mov_b32 s2, -1
750; GCN32-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
751; GCN32-NEXT:    s_endpgm
752entry:
753  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
754  store i64 %old, i64 addrspace(1)* %out
755  ret void
756}
757
; add_i64_uniform: atomicrmw add (acq_rel) of the i64 kernel argument
; %additive to the i64 at %inout; the value the atomic returns is stored to
; %out. In the expected code below, every target guards the buffer atomic with
; s_and_saveexec on "mbcnt result == 0", so a single lane issues it. Its
; operand is the loaded 64-bit argument multiplied by the s_bcnt1 population
; count of the exec mask captured at entry. After exec is restored, each lane
; adds (argument * its mbcnt result) to the v_readfirstlane'd atomic result
; before the store.
758define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
759; GFX7LESS-LABEL: add_i64_uniform:
760; GFX7LESS:       ; %bb.0: ; %entry
761; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
762; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
763; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
764; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
765; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s9, v0
766; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
767; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
768; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
769; GFX7LESS-NEXT:    s_cbranch_execz BB4_2
770; GFX7LESS-NEXT:  ; %bb.1:
771; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
772; GFX7LESS-NEXT:    s_mov_b32 s14, -1
773; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX7LESS-NEXT:    s_mov_b32 s12, s6
775; GFX7LESS-NEXT:    s_mov_b32 s13, s7
776; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
777; GFX7LESS-NEXT:    s_mul_i32 s7, s1, s6
778; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
779; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v1
780; GFX7LESS-NEXT:    s_mul_i32 s6, s0, s6
781; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
782; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
783; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
784; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc
785; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
786; GFX7LESS-NEXT:    buffer_wbinvl1
787; GFX7LESS-NEXT:  BB4_2:
788; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
789; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
791; GFX7LESS-NEXT:    s_mov_b32 s6, -1
792; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
793; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v2
794; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
795; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s1, v0
796; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s0, v0
797; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
798; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
799; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
800; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
801; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
802; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
803; GFX7LESS-NEXT:    s_endpgm
804;
805; GFX8-LABEL: add_i64_uniform:
806; GFX8:       ; %bb.0: ; %entry
807; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
808; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
809; GFX8-NEXT:    s_mov_b64 s[8:9], exec
810; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
811; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
812; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
813; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
814; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
815; GFX8-NEXT:    s_cbranch_execz BB4_2
816; GFX8-NEXT:  ; %bb.1:
817; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
818; GFX8-NEXT:    s_mov_b32 s12, s6
819; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
820; GFX8-NEXT:    v_mov_b32_e32 v1, s6
821; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
822; GFX8-NEXT:    s_mov_b32 s13, s7
823; GFX8-NEXT:    s_mul_i32 s7, s1, s6
824; GFX8-NEXT:    s_mul_i32 s6, s0, s6
825; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
826; GFX8-NEXT:    s_mov_b32 s15, 0xf000
827; GFX8-NEXT:    s_mov_b32 s14, -1
828; GFX8-NEXT:    v_mov_b32_e32 v1, s6
829; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
830; GFX8-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc
831; GFX8-NEXT:    s_waitcnt vmcnt(0)
832; GFX8-NEXT:    buffer_wbinvl1_vol
833; GFX8-NEXT:  BB4_2:
834; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
835; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
836; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
837; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
838; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v0
839; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
840; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
841; GFX8-NEXT:    v_mov_b32_e32 v2, s1
842; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
843; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
844; GFX8-NEXT:    s_mov_b32 s7, 0xf000
845; GFX8-NEXT:    s_mov_b32 s6, -1
846; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
847; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
848; GFX8-NEXT:    s_endpgm
849;
850; GFX9-LABEL: add_i64_uniform:
851; GFX9:       ; %bb.0: ; %entry
852; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
853; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
854; GFX9-NEXT:    s_mov_b64 s[8:9], exec
855; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
856; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
857; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
858; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
859; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
860; GFX9-NEXT:    s_cbranch_execz BB4_2
861; GFX9-NEXT:  ; %bb.1:
862; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
863; GFX9-NEXT:    s_mov_b32 s12, s6
864; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
865; GFX9-NEXT:    s_mov_b32 s13, s7
866; GFX9-NEXT:    s_mul_i32 s7, s3, s6
867; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
868; GFX9-NEXT:    s_add_i32 s8, s8, s7
869; GFX9-NEXT:    s_mul_i32 s6, s2, s6
870; GFX9-NEXT:    s_mov_b32 s15, 0xf000
871; GFX9-NEXT:    s_mov_b32 s14, -1
872; GFX9-NEXT:    v_mov_b32_e32 v1, s6
873; GFX9-NEXT:    v_mov_b32_e32 v2, s8
874; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
875; GFX9-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc
876; GFX9-NEXT:    s_waitcnt vmcnt(0)
877; GFX9-NEXT:    buffer_wbinvl1_vol
878; GFX9-NEXT:  BB4_2:
879; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
880; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
881; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
882; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
883; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
884; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
885; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
886; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
887; GFX9-NEXT:    v_mov_b32_e32 v2, s1
888; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
889; GFX9-NEXT:    s_mov_b32 s7, 0xf000
890; GFX9-NEXT:    s_mov_b32 s6, -1
891; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
892; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
893; GFX9-NEXT:    s_endpgm
894;
895; GCN64-LABEL: add_i64_uniform:
896; GCN64:       ; %bb.0: ; %entry
897; GCN64-NEXT:    s_clause 0x1
898; GCN64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
899; GCN64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
900; GCN64-NEXT:    s_mov_b64 s[8:9], exec
901; GCN64-NEXT:    ; implicit-def: $vgpr1_vgpr2
902; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
903; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s9, v0
904; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
905; GCN64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
906; GCN64-NEXT:    s_cbranch_execz BB4_2
907; GCN64-NEXT:  ; %bb.1:
908; GCN64-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
909; GCN64-NEXT:    s_mov_b32 s11, 0x31016000
910; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
911; GCN64-NEXT:    s_mul_i32 s9, s3, s8
912; GCN64-NEXT:    s_mul_hi_u32 s10, s2, s8
913; GCN64-NEXT:    s_mul_i32 s8, s2, s8
914; GCN64-NEXT:    s_add_i32 s10, s10, s9
915; GCN64-NEXT:    v_mov_b32_e32 v1, s8
916; GCN64-NEXT:    v_mov_b32_e32 v2, s10
917; GCN64-NEXT:    s_mov_b32 s10, -1
918; GCN64-NEXT:    s_mov_b32 s8, s6
919; GCN64-NEXT:    s_mov_b32 s9, s7
920; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
921; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
922; GCN64-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
923; GCN64-NEXT:    s_waitcnt vmcnt(0)
924; GCN64-NEXT:    buffer_gl0_inv
925; GCN64-NEXT:    buffer_gl1_inv
926; GCN64-NEXT:  BB4_2:
927; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
928; GCN64-NEXT:    s_or_b64 exec, exec, s[0:1]
929; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
930; GCN64-NEXT:    v_mul_lo_u32 v3, s3, v0
931; GCN64-NEXT:    v_mul_hi_u32 v4, s2, v0
932; GCN64-NEXT:    v_mul_lo_u32 v0, s2, v0
933; GCN64-NEXT:    v_readfirstlane_b32 s0, v1
934; GCN64-NEXT:    v_readfirstlane_b32 s1, v2
935; GCN64-NEXT:    s_mov_b32 s7, 0x31016000
936; GCN64-NEXT:    s_mov_b32 s6, -1
937; GCN64-NEXT:    v_add_nc_u32_e32 v1, v4, v3
938; GCN64-NEXT:    v_add_co_u32_e64 v0, vcc, s0, v0
939; GCN64-NEXT:    v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc
940; GCN64-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
941; GCN64-NEXT:    s_endpgm
942;
943; GCN32-LABEL: add_i64_uniform:
944; GCN32:       ; %bb.0: ; %entry
945; GCN32-NEXT:    s_clause 0x1
946; GCN32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
947; GCN32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
948; GCN32-NEXT:    s_mov_b32 s8, exec_lo
949; GCN32-NEXT:    ; implicit-def: $vgpr1_vgpr2
950; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
951; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
952; GCN32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
953; GCN32-NEXT:    s_cbranch_execz BB4_2
954; GCN32-NEXT:  ; %bb.1:
955; GCN32-NEXT:    s_bcnt1_i32_b32 s1, s8
956; GCN32-NEXT:    s_mov_b32 s11, 0x31016000
957; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
958; GCN32-NEXT:    s_mul_i32 s8, s3, s1
959; GCN32-NEXT:    s_mul_hi_u32 s9, s2, s1
960; GCN32-NEXT:    s_mul_i32 s1, s2, s1
961; GCN32-NEXT:    s_add_i32 s9, s9, s8
962; GCN32-NEXT:    v_mov_b32_e32 v1, s1
963; GCN32-NEXT:    v_mov_b32_e32 v2, s9
964; GCN32-NEXT:    s_mov_b32 s10, -1
965; GCN32-NEXT:    s_mov_b32 s8, s6
966; GCN32-NEXT:    s_mov_b32 s9, s7
967; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
968; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
969; GCN32-NEXT:    buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc
970; GCN32-NEXT:    s_waitcnt vmcnt(0)
971; GCN32-NEXT:    buffer_gl0_inv
972; GCN32-NEXT:    buffer_gl1_inv
973; GCN32-NEXT:  BB4_2:
974; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
975; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
976; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
977; GCN32-NEXT:    v_mul_lo_u32 v3, s3, v0
978; GCN32-NEXT:    v_mul_hi_u32 v4, s2, v0
979; GCN32-NEXT:    v_mul_lo_u32 v0, s2, v0
980; GCN32-NEXT:    v_readfirstlane_b32 s0, v1
981; GCN32-NEXT:    v_readfirstlane_b32 s1, v2
982; GCN32-NEXT:    s_mov_b32 s7, 0x31016000
983; GCN32-NEXT:    s_mov_b32 s6, -1
984; GCN32-NEXT:    v_add_nc_u32_e32 v1, v4, v3
985; GCN32-NEXT:    v_add_co_u32_e64 v0, vcc_lo, s0, v0
986; GCN32-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
987; GCN32-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
988; GCN32-NEXT:    s_endpgm
989entry:
  ; The operand is a kernel argument, i.e. the same value for every lane.
990  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
  ; Storing the atomic's result keeps the returned value live.
991  store i64 %old, i64 addrspace(1)* %out
992  ret void
993}
994
; add_i64_varying: atomicrmw add (acq_rel) whose i64 operand is the
; zero-extended result of llvm.amdgcn.workitem.id.x, so it is not a constant or
; a kernel argument. The value the atomic returns is stored to %out.
; In the expected code below, every target issues buffer_atomic_add_x2 directly
; on v[0:1] (with v1 set to 0) and stores v[0:1]; there is no
; mbcnt / s_and_saveexec / readfirstlane sequence as in add_i64_uniform.
; NOTE(review): v0 presumably holds the workitem id on kernel entry - confirm
; against the AMDGPU calling convention.
995define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
996; GFX7LESS-LABEL: add_i64_varying:
997; GFX7LESS:       ; %bb.0: ; %entry
998; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
999; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1000; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1001; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1002; GFX7LESS-NEXT:    s_mov_b32 s10, s6
1003; GFX7LESS-NEXT:    s_mov_b32 s11, s7
1004; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1005; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1006; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1007; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1008; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1009; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1010; GFX7LESS-NEXT:    buffer_wbinvl1
1011; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1012; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1013; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1014; GFX7LESS-NEXT:    s_endpgm
1015;
1016; GFX89-LABEL: add_i64_varying:
1017; GFX89:       ; %bb.0: ; %entry
1018; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1019; GFX89-NEXT:    s_mov_b32 s3, 0xf000
1020; GFX89-NEXT:    s_mov_b32 s2, -1
1021; GFX89-NEXT:    v_mov_b32_e32 v1, 0
1022; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
1023; GFX89-NEXT:    s_mov_b32 s0, s4
1024; GFX89-NEXT:    s_mov_b32 s1, s5
1025; GFX89-NEXT:    s_mov_b32 s4, s6
1026; GFX89-NEXT:    s_mov_b32 s5, s7
1027; GFX89-NEXT:    s_mov_b32 s6, s2
1028; GFX89-NEXT:    s_mov_b32 s7, s3
1029; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1030; GFX89-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc
1031; GFX89-NEXT:    s_waitcnt vmcnt(0)
1032; GFX89-NEXT:    buffer_wbinvl1_vol
1033; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1034; GFX89-NEXT:    s_endpgm
1035;
1036; GFX10-LABEL: add_i64_varying:
1037; GFX10:       ; %bb.0: ; %entry
1038; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1039; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1040; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1041; GFX10-NEXT:    s_mov_b32 s6, -1
1042; GFX10-NEXT:    s_mov_b32 s11, s7
1043; GFX10-NEXT:    s_mov_b32 s10, s6
1044; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX10-NEXT:    s_mov_b32 s8, s2
1046; GFX10-NEXT:    s_mov_b32 s9, s3
1047; GFX10-NEXT:    s_mov_b32 s4, s0
1048; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1049; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1050; GFX10-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1051; GFX10-NEXT:    s_waitcnt vmcnt(0)
1052; GFX10-NEXT:    buffer_gl0_inv
1053; GFX10-NEXT:    buffer_gl1_inv
1054; GFX10-NEXT:    s_mov_b32 s5, s1
1055; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1056; GFX10-NEXT:    s_endpgm
1057entry:
  ; The workitem id is the source of the non-uniform operand.
1058  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Widen to i64 so the 64-bit (x2) atomic is exercised.
1059  %zext = zext i32 %lane to i64
1060  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
1061  store i64 %old, i64 addrspace(1)* %out
1062  ret void
1063}
1064
; sub_i32_constant: atomicrmw sub (acq_rel) of the constant 5 from the i32 at
; %inout; the value the atomic returns is stored to %out.
; In the expected code below, every target guards buffer_atomic_sub with
; s_and_saveexec on "mbcnt result == 0", so a single lane issues it. Its
; operand is 5 multiplied by the s_bcnt1 population count of the exec mask
; captured at entry. After exec is restored, each lane computes
; readfirstlane(atomic result) - 5 * (its mbcnt result) via v_mul_u32_u24 and a
; vector subtract, then stores it.
1065define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
1066; GFX7LESS-LABEL: sub_i32_constant:
1067; GFX7LESS:       ; %bb.0: ; %entry
1068; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1069; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1070; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1071; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1072; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1073; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1074; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1075; GFX7LESS-NEXT:    s_cbranch_execz BB6_2
1076; GFX7LESS-NEXT:  ; %bb.1:
1077; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
1078; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1079; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
1080; GFX7LESS-NEXT:    s_mov_b32 s10, -1
1081; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1083; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1084; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1085; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1086; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1087; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1088; GFX7LESS-NEXT:    buffer_wbinvl1
1089; GFX7LESS-NEXT:  BB6_2:
1090; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1091; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1092; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1093; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1094; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1095; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1096; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1097; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1098; GFX7LESS-NEXT:    s_endpgm
1099;
1100; GFX8-LABEL: sub_i32_constant:
1101; GFX8:       ; %bb.0: ; %entry
1102; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1103; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1104; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1105; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1106; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1107; GFX8-NEXT:    ; implicit-def: $vgpr1
1108; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1109; GFX8-NEXT:    s_cbranch_execz BB6_2
1110; GFX8-NEXT:  ; %bb.1:
1111; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1112; GFX8-NEXT:    s_mov_b32 s8, s2
1113; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1114; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1115; GFX8-NEXT:    s_mov_b32 s11, 0xf000
1116; GFX8-NEXT:    s_mov_b32 s10, -1
1117; GFX8-NEXT:    s_mov_b32 s9, s3
1118; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1119; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1120; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1121; GFX8-NEXT:    s_waitcnt vmcnt(0)
1122; GFX8-NEXT:    buffer_wbinvl1_vol
1123; GFX8-NEXT:  BB6_2:
1124; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1125; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
1126; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1127; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1128; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1129; GFX8-NEXT:    s_mov_b32 s2, -1
1130; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
1131; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1132; GFX8-NEXT:    s_endpgm
1133;
1134; GFX9-LABEL: sub_i32_constant:
1135; GFX9:       ; %bb.0: ; %entry
1136; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1137; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1138; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1139; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1140; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1141; GFX9-NEXT:    ; implicit-def: $vgpr1
1142; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1143; GFX9-NEXT:    s_cbranch_execz BB6_2
1144; GFX9-NEXT:  ; %bb.1:
1145; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1146; GFX9-NEXT:    s_mov_b32 s8, s2
1147; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1148; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1149; GFX9-NEXT:    s_mov_b32 s11, 0xf000
1150; GFX9-NEXT:    s_mov_b32 s10, -1
1151; GFX9-NEXT:    s_mov_b32 s9, s3
1152; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1153; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1154; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1155; GFX9-NEXT:    s_waitcnt vmcnt(0)
1156; GFX9-NEXT:    buffer_wbinvl1_vol
1157; GFX9-NEXT:  BB6_2:
1158; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1159; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
1160; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1161; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1162; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1163; GFX9-NEXT:    s_mov_b32 s2, -1
1164; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1165; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1166; GFX9-NEXT:    s_endpgm
1167;
1168; GCN64-LABEL: sub_i32_constant:
1169; GCN64:       ; %bb.0: ; %entry
1170; GCN64-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1171; GCN64-NEXT:    s_mov_b64 s[6:7], exec
1172; GCN64-NEXT:    ; implicit-def: $vgpr1
1173; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1174; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1175; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1176; GCN64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1177; GCN64-NEXT:    s_cbranch_execz BB6_2
1178; GCN64-NEXT:  ; %bb.1:
1179; GCN64-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1180; GCN64-NEXT:    s_mov_b32 s11, 0x31016000
1181; GCN64-NEXT:    s_mul_i32 s6, s6, 5
1182; GCN64-NEXT:    s_mov_b32 s10, -1
1183; GCN64-NEXT:    v_mov_b32_e32 v1, s6
1184; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
1185; GCN64-NEXT:    s_mov_b32 s8, s2
1186; GCN64-NEXT:    s_mov_b32 s9, s3
1187; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1188; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
1189; GCN64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1190; GCN64-NEXT:    s_waitcnt vmcnt(0)
1191; GCN64-NEXT:    buffer_gl0_inv
1192; GCN64-NEXT:    buffer_gl1_inv
1193; GCN64-NEXT:  BB6_2:
1194; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
1195; GCN64-NEXT:    s_or_b64 exec, exec, s[4:5]
1196; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
1197; GCN64-NEXT:    v_readfirstlane_b32 s2, v1
1198; GCN64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1199; GCN64-NEXT:    s_mov_b32 s3, 0x31016000
1200; GCN64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1201; GCN64-NEXT:    s_mov_b32 s2, -1
1202; GCN64-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1203; GCN64-NEXT:    s_endpgm
1204;
1205; GCN32-LABEL: sub_i32_constant:
1206; GCN32:       ; %bb.0: ; %entry
1207; GCN32-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1208; GCN32-NEXT:    s_mov_b32 s5, exec_lo
1209; GCN32-NEXT:    ; implicit-def: $vgpr1
1210; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1211; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1212; GCN32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1213; GCN32-NEXT:    s_cbranch_execz BB6_2
1214; GCN32-NEXT:  ; %bb.1:
1215; GCN32-NEXT:    s_bcnt1_i32_b32 s5, s5
1216; GCN32-NEXT:    s_mov_b32 s11, 0x31016000
1217; GCN32-NEXT:    s_mul_i32 s5, s5, 5
1218; GCN32-NEXT:    s_mov_b32 s10, -1
1219; GCN32-NEXT:    v_mov_b32_e32 v1, s5
1220; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
1221; GCN32-NEXT:    s_mov_b32 s8, s2
1222; GCN32-NEXT:    s_mov_b32 s9, s3
1223; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1224; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
1225; GCN32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1226; GCN32-NEXT:    s_waitcnt vmcnt(0)
1227; GCN32-NEXT:    buffer_gl0_inv
1228; GCN32-NEXT:    buffer_gl1_inv
1229; GCN32-NEXT:  BB6_2:
1230; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
1231; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1232; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
1233; GCN32-NEXT:    v_readfirstlane_b32 s2, v1
1234; GCN32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1235; GCN32-NEXT:    s_mov_b32 s3, 0x31016000
1236; GCN32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1237; GCN32-NEXT:    s_mov_b32 s2, -1
1238; GCN32-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1239; GCN32-NEXT:    s_endpgm
1240entry:
  ; The operand is the immediate 5, identical for every lane.
1241  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
  ; Storing the atomic's result keeps the returned value live.
1242  store i32 %old, i32 addrspace(1)* %out
1243  ret void
1244}
1245
; sub_i32_uniform: atomicrmw sub (acq_rel) of the i32 kernel argument
; %subitive from the i32 at %inout; the value the atomic returns is stored to
; %out. In the expected code below, every target guards buffer_atomic_sub with
; s_and_saveexec on "mbcnt result == 0", so a single lane issues it. Its
; operand is the loaded argument multiplied (s_mul_i32) by the s_bcnt1
; population count of the exec mask captured at entry. After exec is restored,
; each lane computes readfirstlane(atomic result) - argument * (its mbcnt
; result) via v_mul_lo_u32 and a vector subtract, then stores it.
1246define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
1247; GFX7LESS-LABEL: sub_i32_uniform:
1248; GFX7LESS:       ; %bb.0: ; %entry
1249; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1250; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1251; GFX7LESS-NEXT:    s_load_dword s0, s[0:1], 0xd
1252; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1253; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1254; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1255; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1256; GFX7LESS-NEXT:    s_and_saveexec_b64 s[8:9], vcc
1257; GFX7LESS-NEXT:    s_cbranch_execz BB7_2
1258; GFX7LESS-NEXT:  ; %bb.1:
1259; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
1260; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1261; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1262; GFX7LESS-NEXT:    s_mul_i32 s1, s0, s1
1263; GFX7LESS-NEXT:    s_mov_b32 s14, -1
1264; GFX7LESS-NEXT:    s_mov_b32 s12, s6
1265; GFX7LESS-NEXT:    s_mov_b32 s13, s7
1266; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s1
1267; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1268; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1269; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1270; GFX7LESS-NEXT:    buffer_wbinvl1
1271; GFX7LESS-NEXT:  BB7_2:
1272; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[8:9]
1273; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1274; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1275; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1276; GFX7LESS-NEXT:    v_readfirstlane_b32 s1, v1
1277; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
1278; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s1, v0
1279; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1280; GFX7LESS-NEXT:    s_endpgm
1281;
1282; GFX8-LABEL: sub_i32_uniform:
1283; GFX8:       ; %bb.0: ; %entry
1284; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1285; GFX8-NEXT:    s_load_dword s0, s[0:1], 0x34
1286; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1287; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1288; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1289; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1290; GFX8-NEXT:    ; implicit-def: $vgpr1
1291; GFX8-NEXT:    s_and_saveexec_b64 s[8:9], vcc
1292; GFX8-NEXT:    s_cbranch_execz BB7_2
1293; GFX8-NEXT:  ; %bb.1:
1294; GFX8-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
1295; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1296; GFX8-NEXT:    s_mul_i32 s1, s0, s1
1297; GFX8-NEXT:    s_mov_b32 s15, 0xf000
1298; GFX8-NEXT:    s_mov_b32 s14, -1
1299; GFX8-NEXT:    s_mov_b32 s12, s6
1300; GFX8-NEXT:    s_mov_b32 s13, s7
1301; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1302; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1303; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1304; GFX8-NEXT:    s_waitcnt vmcnt(0)
1305; GFX8-NEXT:    buffer_wbinvl1_vol
1306; GFX8-NEXT:  BB7_2:
1307; GFX8-NEXT:    s_or_b64 exec, exec, s[8:9]
1308; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1309; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1310; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1311; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1312; GFX8-NEXT:    s_mov_b32 s6, -1
1313; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1314; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1315; GFX8-NEXT:    s_endpgm
1316;
1317; GFX9-LABEL: sub_i32_uniform:
1318; GFX9:       ; %bb.0: ; %entry
1319; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1320; GFX9-NEXT:    s_load_dword s2, s[0:1], 0x34
1321; GFX9-NEXT:    s_mov_b64 s[8:9], exec
1322; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1323; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
1324; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1325; GFX9-NEXT:    ; implicit-def: $vgpr1
1326; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1327; GFX9-NEXT:    s_cbranch_execz BB7_2
1328; GFX9-NEXT:  ; %bb.1:
1329; GFX9-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
1330; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX9-NEXT:    s_mul_i32 s3, s2, s3
1332; GFX9-NEXT:    s_mov_b32 s15, 0xf000
1333; GFX9-NEXT:    s_mov_b32 s14, -1
1334; GFX9-NEXT:    s_mov_b32 s12, s6
1335; GFX9-NEXT:    s_mov_b32 s13, s7
1336; GFX9-NEXT:    v_mov_b32_e32 v1, s3
1337; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1338; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1339; GFX9-NEXT:    s_waitcnt vmcnt(0)
1340; GFX9-NEXT:    buffer_wbinvl1_vol
1341; GFX9-NEXT:  BB7_2:
1342; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1343; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1344; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
1345; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1346; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1347; GFX9-NEXT:    s_mov_b32 s6, -1
1348; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1349; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1350; GFX9-NEXT:    s_endpgm
1351;
1352; GCN64-LABEL: sub_i32_uniform:
1353; GCN64:       ; %bb.0: ; %entry
1354; GCN64-NEXT:    s_clause 0x1
1355; GCN64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1356; GCN64-NEXT:    s_load_dword s2, s[0:1], 0x34
1357; GCN64-NEXT:    s_mov_b64 s[8:9], exec
1358; GCN64-NEXT:    ; implicit-def: $vgpr1
1359; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
1360; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s9, v0
1361; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1362; GCN64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1363; GCN64-NEXT:    s_cbranch_execz BB7_2
1364; GCN64-NEXT:  ; %bb.1:
1365; GCN64-NEXT:    s_bcnt1_i32_b64 s3, s[8:9]
1366; GCN64-NEXT:    s_mov_b32 s11, 0x31016000
1367; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
1368; GCN64-NEXT:    s_mul_i32 s3, s2, s3
1369; GCN64-NEXT:    s_mov_b32 s10, -1
1370; GCN64-NEXT:    v_mov_b32_e32 v1, s3
1371; GCN64-NEXT:    s_mov_b32 s8, s6
1372; GCN64-NEXT:    s_mov_b32 s9, s7
1373; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1374; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
1375; GCN64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1376; GCN64-NEXT:    s_waitcnt vmcnt(0)
1377; GCN64-NEXT:    buffer_gl0_inv
1378; GCN64-NEXT:    buffer_gl1_inv
1379; GCN64-NEXT:  BB7_2:
1380; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
1381; GCN64-NEXT:    s_or_b64 exec, exec, s[0:1]
1382; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
1383; GCN64-NEXT:    v_mul_lo_u32 v0, s2, v0
1384; GCN64-NEXT:    v_readfirstlane_b32 s0, v1
1385; GCN64-NEXT:    s_mov_b32 s7, 0x31016000
1386; GCN64-NEXT:    s_mov_b32 s6, -1
1387; GCN64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1388; GCN64-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1389; GCN64-NEXT:    s_endpgm
1390;
1391; GCN32-LABEL: sub_i32_uniform:
1392; GCN32:       ; %bb.0: ; %entry
1393; GCN32-NEXT:    s_clause 0x1
1394; GCN32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1395; GCN32-NEXT:    s_load_dword s2, s[0:1], 0x34
1396; GCN32-NEXT:    s_mov_b32 s3, exec_lo
1397; GCN32-NEXT:    ; implicit-def: $vgpr1
1398; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s3, 0
1399; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1400; GCN32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1401; GCN32-NEXT:    s_cbranch_execz BB7_2
1402; GCN32-NEXT:  ; %bb.1:
1403; GCN32-NEXT:    s_bcnt1_i32_b32 s1, s3
1404; GCN32-NEXT:    s_mov_b32 s11, 0x31016000
1405; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
1406; GCN32-NEXT:    s_mul_i32 s1, s2, s1
1407; GCN32-NEXT:    s_mov_b32 s10, -1
1408; GCN32-NEXT:    v_mov_b32_e32 v1, s1
1409; GCN32-NEXT:    s_mov_b32 s8, s6
1410; GCN32-NEXT:    s_mov_b32 s9, s7
1411; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1412; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
1413; GCN32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1414; GCN32-NEXT:    s_waitcnt vmcnt(0)
1415; GCN32-NEXT:    buffer_gl0_inv
1416; GCN32-NEXT:    buffer_gl1_inv
1417; GCN32-NEXT:  BB7_2:
1418; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
1419; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1420; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
1421; GCN32-NEXT:    v_mul_lo_u32 v0, s2, v0
1422; GCN32-NEXT:    v_readfirstlane_b32 s0, v1
1423; GCN32-NEXT:    s_mov_b32 s7, 0x31016000
1424; GCN32-NEXT:    s_mov_b32 s6, -1
1425; GCN32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1426; GCN32-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1427; GCN32-NEXT:    s_endpgm
1428entry:
  ; The operand is a kernel argument, i.e. the same value for every lane.
1429  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
  ; Storing the atomic's result keeps the returned value live.
1430  store i32 %old, i32 addrspace(1)* %out
1431  ret void
1432}
1433
; sub_i32_varying - atomicrmw sub on a global i32 whose operand is the
; workitem id (@llvm.amdgcn.workitem.id.x), so the operand comes from a
; per-workitem value rather than a constant or a kernel argument. The value
; returned by the atomic is stored to %out.
; Code shape recorded by the autogenerated check lines below
;  - GFX7LESS expects one buffer_atomic_sub on v0 with no mbcnt or saveexec
;    sequence around it.
;  - GFX8 and GFX9 expect a chain of v_add_u32_dpp steps, one
;    buffer_atomic_sub issued under s_and_saveexec_b64, then
;    v_readfirstlane_b32 followed by a subtract to form the stored value.
;  - GCN64 and GCN32 (the gfx1010 wave64 and wave32 RUN lines) expect the same
;    shape built from v_add_nc_u32_dpp, v_permlanex16_b32 and v_writelane_b32.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
1434define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
1435; GFX7LESS-LABEL: sub_i32_varying:
1436; GFX7LESS:       ; %bb.0: ; %entry
1437; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1438; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1439; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1440; GFX7LESS-NEXT:    s_mov_b32 s10, s6
1441; GFX7LESS-NEXT:    s_mov_b32 s11, s7
1442; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1443; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1444; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1445; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1446; GFX7LESS-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1447; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1448; GFX7LESS-NEXT:    buffer_wbinvl1
1449; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1450; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1451; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1452; GFX7LESS-NEXT:    s_endpgm
1453;
1454; GFX8-LABEL: sub_i32_varying:
1455; GFX8:       ; %bb.0: ; %entry
1456; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1457; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1458; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1459; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1460; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1461; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1462; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1463; GFX8-NEXT:    s_not_b64 exec, exec
1464; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1465; GFX8-NEXT:    s_not_b64 exec, exec
1466; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1467; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1468; GFX8-NEXT:    s_nop 1
1469; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1470; GFX8-NEXT:    s_nop 1
1471; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1472; GFX8-NEXT:    s_nop 1
1473; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1474; GFX8-NEXT:    s_nop 1
1475; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1476; GFX8-NEXT:    s_nop 1
1477; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1478; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
1479; GFX8-NEXT:    s_nop 0
1480; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1481; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1482; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1483; GFX8-NEXT:    ; implicit-def: $vgpr0
1484; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1485; GFX8-NEXT:    s_cbranch_execz BB8_2
1486; GFX8-NEXT:  ; %bb.1:
1487; GFX8-NEXT:    s_mov_b32 s11, 0xf000
1488; GFX8-NEXT:    s_mov_b32 s10, -1
1489; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1490; GFX8-NEXT:    s_mov_b32 s8, s2
1491; GFX8-NEXT:    s_mov_b32 s9, s3
1492; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1493; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1494; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1495; GFX8-NEXT:    s_waitcnt vmcnt(0)
1496; GFX8-NEXT:    buffer_wbinvl1_vol
1497; GFX8-NEXT:  BB8_2:
1498; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1499; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
1500; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1501; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1502; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1503; GFX8-NEXT:    s_mov_b32 s2, -1
1504; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
1505; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1506; GFX8-NEXT:    s_endpgm
1507;
1508; GFX9-LABEL: sub_i32_varying:
1509; GFX9:       ; %bb.0: ; %entry
1510; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1511; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1512; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1513; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1514; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1515; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1516; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1517; GFX9-NEXT:    s_not_b64 exec, exec
1518; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1519; GFX9-NEXT:    s_not_b64 exec, exec
1520; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1521; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1522; GFX9-NEXT:    s_nop 1
1523; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1524; GFX9-NEXT:    s_nop 1
1525; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1526; GFX9-NEXT:    s_nop 1
1527; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1528; GFX9-NEXT:    s_nop 1
1529; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1530; GFX9-NEXT:    s_nop 1
1531; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1532; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
1533; GFX9-NEXT:    s_nop 0
1534; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1535; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1536; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1537; GFX9-NEXT:    ; implicit-def: $vgpr0
1538; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1539; GFX9-NEXT:    s_cbranch_execz BB8_2
1540; GFX9-NEXT:  ; %bb.1:
1541; GFX9-NEXT:    s_mov_b32 s11, 0xf000
1542; GFX9-NEXT:    s_mov_b32 s10, -1
1543; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1544; GFX9-NEXT:    s_mov_b32 s8, s2
1545; GFX9-NEXT:    s_mov_b32 s9, s3
1546; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1547; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1548; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1549; GFX9-NEXT:    s_waitcnt vmcnt(0)
1550; GFX9-NEXT:    buffer_wbinvl1_vol
1551; GFX9-NEXT:  BB8_2:
1552; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1553; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
1554; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1555; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1556; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1557; GFX9-NEXT:    s_mov_b32 s2, -1
1558; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1559; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1560; GFX9-NEXT:    s_endpgm
1561;
1562; GCN64-LABEL: sub_i32_varying:
1563; GCN64:       ; %bb.0: ; %entry
1564; GCN64-NEXT:    v_mov_b32_e32 v1, v0
1565; GCN64-NEXT:    s_not_b64 exec, exec
1566; GCN64-NEXT:    v_mov_b32_e32 v1, 0
1567; GCN64-NEXT:    s_not_b64 exec, exec
1568; GCN64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1569; GCN64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1570; GCN64-NEXT:    v_mov_b32_e32 v3, 0
1571; GCN64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1572; GCN64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1573; GCN64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1574; GCN64-NEXT:    v_mov_b32_e32 v2, v1
1575; GCN64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1576; GCN64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1577; GCN64-NEXT:    v_readlane_b32 s4, v1, 31
1578; GCN64-NEXT:    v_mov_b32_e32 v2, s4
1579; GCN64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1580; GCN64-NEXT:    v_readlane_b32 s6, v1, 15
1581; GCN64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1582; GCN64-NEXT:    s_mov_b64 exec, s[2:3]
1583; GCN64-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1584; GCN64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1585; GCN64-NEXT:    v_readlane_b32 s7, v1, 31
1586; GCN64-NEXT:    v_writelane_b32 v3, s6, 16
1587; GCN64-NEXT:    s_mov_b64 exec, s[4:5]
1588; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1589; GCN64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1590; GCN64-NEXT:    v_readlane_b32 s8, v1, 47
1591; GCN64-NEXT:    v_readlane_b32 s9, v1, 63
1592; GCN64-NEXT:    v_writelane_b32 v3, s7, 32
1593; GCN64-NEXT:    s_mov_b64 exec, s[4:5]
1594; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0
1595; GCN64-NEXT:    s_or_saveexec_b64 s[6:7], -1
1596; GCN64-NEXT:    s_mov_b32 s4, s9
1597; GCN64-NEXT:    v_writelane_b32 v3, s8, 48
1598; GCN64-NEXT:    s_mov_b64 exec, s[6:7]
1599; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1600; GCN64-NEXT:    s_mov_b32 s6, -1
1601; GCN64-NEXT:    ; implicit-def: $vgpr0
1602; GCN64-NEXT:    s_and_saveexec_b64 s[8:9], vcc
1603; GCN64-NEXT:    s_cbranch_execz BB8_2
1604; GCN64-NEXT:  ; %bb.1:
1605; GCN64-NEXT:    v_mov_b32_e32 v0, s4
1606; GCN64-NEXT:    s_mov_b32 s7, 0x31016000
1607; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
1608; GCN64-NEXT:    s_mov_b32 s4, s2
1609; GCN64-NEXT:    s_mov_b32 s5, s3
1610; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1611; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
1612; GCN64-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
1613; GCN64-NEXT:    s_waitcnt vmcnt(0)
1614; GCN64-NEXT:    buffer_gl0_inv
1615; GCN64-NEXT:    buffer_gl1_inv
1616; GCN64-NEXT:  BB8_2:
1617; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
1618; GCN64-NEXT:    s_or_b64 exec, exec, s[8:9]
1619; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
1620; GCN64-NEXT:    v_readfirstlane_b32 s2, v0
1621; GCN64-NEXT:    v_mov_b32_e32 v0, v3
1622; GCN64-NEXT:    s_mov_b32 s3, 0x31016000
1623; GCN64-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1624; GCN64-NEXT:    s_mov_b32 s2, s6
1625; GCN64-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1626; GCN64-NEXT:    s_endpgm
1627;
1628; GCN32-LABEL: sub_i32_varying:
1629; GCN32:       ; %bb.0: ; %entry
1630; GCN32-NEXT:    v_mov_b32_e32 v1, v0
1631; GCN32-NEXT:    s_not_b32 exec_lo, exec_lo
1632; GCN32-NEXT:    v_mov_b32_e32 v1, 0
1633; GCN32-NEXT:    s_not_b32 exec_lo, exec_lo
1634; GCN32-NEXT:    s_or_saveexec_b32 s2, -1
1635; GCN32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1636; GCN32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1637; GCN32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1638; GCN32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1639; GCN32-NEXT:    v_mov_b32_e32 v2, v1
1640; GCN32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1641; GCN32-NEXT:    s_mov_b32 exec_lo, s2
1642; GCN32-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1643; GCN32-NEXT:    s_or_saveexec_b32 s4, -1
1644; GCN32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1645; GCN32-NEXT:    v_mov_b32_e32 v3, 0
1646; GCN32-NEXT:    v_readlane_b32 s5, v1, 15
1647; GCN32-NEXT:    v_readlane_b32 s6, v1, 31
1648; GCN32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1649; GCN32-NEXT:    s_mov_b32 exec_lo, s4
1650; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1651; GCN32-NEXT:    s_or_saveexec_b32 s4, -1
1652; GCN32-NEXT:    v_writelane_b32 v3, s5, 16
1653; GCN32-NEXT:    s_mov_b32 exec_lo, s4
1654; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1655; GCN32-NEXT:    s_mov_b32 s4, s6
1656; GCN32-NEXT:    s_mov_b32 s6, -1
1657; GCN32-NEXT:    ; implicit-def: $vgpr0
1658; GCN32-NEXT:    s_and_saveexec_b32 s8, vcc_lo
1659; GCN32-NEXT:    s_cbranch_execz BB8_2
1660; GCN32-NEXT:  ; %bb.1:
1661; GCN32-NEXT:    v_mov_b32_e32 v0, s4
1662; GCN32-NEXT:    s_mov_b32 s7, 0x31016000
1663; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
1664; GCN32-NEXT:    s_mov_b32 s4, s2
1665; GCN32-NEXT:    s_mov_b32 s5, s3
1666; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1667; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
1668; GCN32-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
1669; GCN32-NEXT:    s_waitcnt vmcnt(0)
1670; GCN32-NEXT:    buffer_gl0_inv
1671; GCN32-NEXT:    buffer_gl1_inv
1672; GCN32-NEXT:  BB8_2:
1673; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
1674; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s8
1675; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
1676; GCN32-NEXT:    v_readfirstlane_b32 s2, v0
1677; GCN32-NEXT:    v_mov_b32_e32 v0, v3
1678; GCN32-NEXT:    s_mov_b32 s3, 0x31016000
1679; GCN32-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1680; GCN32-NEXT:    s_mov_b32 s2, s6
1681; GCN32-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1682; GCN32-NEXT:    s_endpgm
1683entry:
  ; %lane is the x workitem id and is used directly as the subtrahend.
1684  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %old is the value the atomicrmw returns (the original memory contents).
1685  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
1686  store i32 %old, i32 addrspace(1)* %out
1687  ret void
1688}
1689
; sub_i64_constant - atomicrmw sub on a global i64 with the constant operand 5.
; The value returned by the atomic is stored to %out.
; Code shape recorded by the autogenerated check lines below, for every prefix
;  - a copy of exec is counted with s_bcnt1 and the count is multiplied by 5
;    (s_mul_i32 for the low half, v_mul_hi_u32_u24 for the high half);
;  - one buffer_atomic_sub_x2 is issued under s_and_saveexec, guarded by a
;    compare of the mbcnt result against 0;
;  - afterwards both halves of the atomic result are read with
;    v_readfirstlane_b32 and 5 times the mbcnt value is subtracted from them
;    as a 64-bit subtract with borrow.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
1690define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
1691; GFX7LESS-LABEL: sub_i64_constant:
1692; GFX7LESS:       ; %bb.0: ; %entry
1693; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1694; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1695; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1696; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1697; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1698; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1699; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1700; GFX7LESS-NEXT:    s_cbranch_execz BB9_2
1701; GFX7LESS-NEXT:  ; %bb.1:
1702; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
1703; GFX7LESS-NEXT:    s_mov_b32 s10, -1
1704; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1705; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1706; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1707; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1708; GFX7LESS-NEXT:    s_mul_i32 s3, s2, 5
1709; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e64 v2, s2, 5
1710; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s3
1711; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1712; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
1713; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1714; GFX7LESS-NEXT:    buffer_wbinvl1
1715; GFX7LESS-NEXT:  BB9_2:
1716; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1717; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1718; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1719; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1720; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1721; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v2
1722; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
1723; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1724; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1725; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
1726; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1727; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1728; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1729; GFX7LESS-NEXT:    s_endpgm
1730;
1731; GFX8-LABEL: sub_i64_constant:
1732; GFX8:       ; %bb.0: ; %entry
1733; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1734; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1735; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1736; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1737; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1738; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1739; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1740; GFX8-NEXT:    s_cbranch_execz BB9_2
1741; GFX8-NEXT:  ; %bb.1:
1742; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1743; GFX8-NEXT:    s_mov_b32 s8, s2
1744; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1745; GFX8-NEXT:    v_mul_hi_u32_u24_e64 v2, s2, 5
1746; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1747; GFX8-NEXT:    s_mov_b32 s11, 0xf000
1748; GFX8-NEXT:    s_mov_b32 s10, -1
1749; GFX8-NEXT:    s_mov_b32 s9, s3
1750; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1751; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1752; GFX8-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
1753; GFX8-NEXT:    s_waitcnt vmcnt(0)
1754; GFX8-NEXT:    buffer_wbinvl1_vol
1755; GFX8-NEXT:  BB9_2:
1756; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1757; GFX8-NEXT:    v_readfirstlane_b32 s5, v2
1758; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
1759; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1760; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1761; GFX8-NEXT:    v_mov_b32_e32 v2, s5
1762; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
1763; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1764; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1765; GFX8-NEXT:    s_mov_b32 s2, -1
1766; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1767; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1768; GFX8-NEXT:    s_endpgm
1769;
1770; GFX9-LABEL: sub_i64_constant:
1771; GFX9:       ; %bb.0: ; %entry
1772; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1773; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1774; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1775; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1776; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1777; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1778; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1779; GFX9-NEXT:    s_cbranch_execz BB9_2
1780; GFX9-NEXT:  ; %bb.1:
1781; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1782; GFX9-NEXT:    s_mov_b32 s8, s2
1783; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1784; GFX9-NEXT:    v_mul_hi_u32_u24_e64 v2, s2, 5
1785; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1786; GFX9-NEXT:    s_mov_b32 s11, 0xf000
1787; GFX9-NEXT:    s_mov_b32 s10, -1
1788; GFX9-NEXT:    s_mov_b32 s9, s3
1789; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1790; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1791; GFX9-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
1792; GFX9-NEXT:    s_waitcnt vmcnt(0)
1793; GFX9-NEXT:    buffer_wbinvl1_vol
1794; GFX9-NEXT:  BB9_2:
1795; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1796; GFX9-NEXT:    v_readfirstlane_b32 s5, v2
1797; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
1798; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v0
1799; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1800; GFX9-NEXT:    v_mov_b32_e32 v2, s5
1801; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
1802; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1803; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1804; GFX9-NEXT:    s_mov_b32 s2, -1
1805; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
1806; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1807; GFX9-NEXT:    s_endpgm
1808;
1809; GCN64-LABEL: sub_i64_constant:
1810; GCN64:       ; %bb.0: ; %entry
1811; GCN64-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1812; GCN64-NEXT:    s_mov_b64 s[6:7], exec
1813; GCN64-NEXT:    ; implicit-def: $vgpr1_vgpr2
1814; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1815; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s7, v0
1816; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1817; GCN64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1818; GCN64-NEXT:    s_cbranch_execz BB9_2
1819; GCN64-NEXT:  ; %bb.1:
1820; GCN64-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1821; GCN64-NEXT:    s_mov_b32 s11, 0x31016000
1822; GCN64-NEXT:    s_mul_i32 s7, s6, 5
1823; GCN64-NEXT:    v_mul_hi_u32_u24_e64 v2, s6, 5
1824; GCN64-NEXT:    v_mov_b32_e32 v1, s7
1825; GCN64-NEXT:    s_mov_b32 s10, -1
1826; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
1827; GCN64-NEXT:    s_mov_b32 s8, s2
1828; GCN64-NEXT:    s_mov_b32 s9, s3
1829; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1830; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
1831; GCN64-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
1832; GCN64-NEXT:    s_waitcnt vmcnt(0)
1833; GCN64-NEXT:    buffer_gl0_inv
1834; GCN64-NEXT:    buffer_gl1_inv
1835; GCN64-NEXT:  BB9_2:
1836; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
1837; GCN64-NEXT:    s_or_b64 exec, exec, s[4:5]
1838; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
1839; GCN64-NEXT:    v_readfirstlane_b32 s2, v1
1840; GCN64-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
1841; GCN64-NEXT:    v_readfirstlane_b32 s3, v2
1842; GCN64-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
1843; GCN64-NEXT:    v_sub_co_u32_e64 v0, vcc, s2, v1
1844; GCN64-NEXT:    s_mov_b32 s2, -1
1845; GCN64-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc
1846; GCN64-NEXT:    s_mov_b32 s3, 0x31016000
1847; GCN64-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1848; GCN64-NEXT:    s_endpgm
1849;
1850; GCN32-LABEL: sub_i64_constant:
1851; GCN32:       ; %bb.0: ; %entry
1852; GCN32-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1853; GCN32-NEXT:    s_mov_b32 s5, exec_lo
1854; GCN32-NEXT:    ; implicit-def: $vgpr1_vgpr2
1855; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s5, 0
1856; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1857; GCN32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1858; GCN32-NEXT:    s_cbranch_execz BB9_2
1859; GCN32-NEXT:  ; %bb.1:
1860; GCN32-NEXT:    s_bcnt1_i32_b32 s5, s5
1861; GCN32-NEXT:    s_mov_b32 s11, 0x31016000
1862; GCN32-NEXT:    s_mul_i32 s6, s5, 5
1863; GCN32-NEXT:    v_mul_hi_u32_u24_e64 v2, s5, 5
1864; GCN32-NEXT:    v_mov_b32_e32 v1, s6
1865; GCN32-NEXT:    s_mov_b32 s10, -1
1866; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
1867; GCN32-NEXT:    s_mov_b32 s8, s2
1868; GCN32-NEXT:    s_mov_b32 s9, s3
1869; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1870; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
1871; GCN32-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
1872; GCN32-NEXT:    s_waitcnt vmcnt(0)
1873; GCN32-NEXT:    buffer_gl0_inv
1874; GCN32-NEXT:    buffer_gl1_inv
1875; GCN32-NEXT:  BB9_2:
1876; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
1877; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1878; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
1879; GCN32-NEXT:    v_readfirstlane_b32 s2, v1
1880; GCN32-NEXT:    v_mul_u32_u24_e32 v1, 5, v0
1881; GCN32-NEXT:    v_readfirstlane_b32 s3, v2
1882; GCN32-NEXT:    v_mul_hi_u32_u24_e32 v2, 5, v0
1883; GCN32-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s2, v1
1884; GCN32-NEXT:    s_mov_b32 s2, -1
1885; GCN32-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo
1886; GCN32-NEXT:    s_mov_b32 s3, 0x31016000
1887; GCN32-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1888; GCN32-NEXT:    s_endpgm
1889entry:
  ; The subtrahend is the literal 5, which is the factor seen in the
  ; multiplies in the check lines above.
1890  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
1891  store i64 %old, i64 addrspace(1)* %out
1892  ret void
1893}
1894
; sub_i64_uniform - atomicrmw sub on a global i64 whose operand is the i64
; kernel argument %subitive. The value returned by the atomic is stored to
; %out.
; Code shape recorded by the autogenerated check lines below, for every prefix
;  - the argument is loaded with a second s_load_dwordx2 and multiplied by the
;    s_bcnt1 count of a copy of exec (mul_i32 / mul_hi / add sequence);
;  - one buffer_atomic_sub_x2 is issued under s_and_saveexec, guarded by a
;    compare of the mbcnt result against 0;
;  - afterwards the argument is multiplied by the mbcnt value with
;    v_mul_lo_u32 / v_mul_hi_u32 and that product is subtracted, with borrow,
;    from the v_readfirstlane_b32 copies of the atomic result.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
1895define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
1896; GFX7LESS-LABEL: sub_i64_uniform:
1897; GFX7LESS:       ; %bb.0: ; %entry
1898; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
1899; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1900; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1901; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
1902; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s9, v0
1903; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1904; GFX7LESS-NEXT:    ; implicit-def: $vgpr1_vgpr2
1905; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1906; GFX7LESS-NEXT:    s_cbranch_execz BB10_2
1907; GFX7LESS-NEXT:  ; %bb.1:
1908; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
1909; GFX7LESS-NEXT:    s_mov_b32 s14, -1
1910; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1911; GFX7LESS-NEXT:    s_mov_b32 s12, s6
1912; GFX7LESS-NEXT:    s_mov_b32 s13, s7
1913; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
1914; GFX7LESS-NEXT:    s_mul_i32 s7, s1, s6
1915; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1916; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v1
1917; GFX7LESS-NEXT:    s_mul_i32 s6, s0, s6
1918; GFX7LESS-NEXT:    v_add_i32_e32 v2, vcc, s7, v1
1919; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1920; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1921; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc
1922; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1923; GFX7LESS-NEXT:    buffer_wbinvl1
1924; GFX7LESS-NEXT:  BB10_2:
1925; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1926; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1927; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1928; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1929; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v1
1930; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v2
1931; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
1932; GFX7LESS-NEXT:    v_mul_lo_u32 v1, s1, v0
1933; GFX7LESS-NEXT:    v_mul_hi_u32 v2, s0, v0
1934; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s0, v0
1935; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
1936; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s3
1937; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
1938; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1939; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1940; GFX7LESS-NEXT:    s_endpgm
1941;
1942; GFX8-LABEL: sub_i64_uniform:
1943; GFX8:       ; %bb.0: ; %entry
1944; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1945; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1946; GFX8-NEXT:    s_mov_b64 s[8:9], exec
1947; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1948; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
1949; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1950; GFX8-NEXT:    ; implicit-def: $vgpr1_vgpr2
1951; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1952; GFX8-NEXT:    s_cbranch_execz BB10_2
1953; GFX8-NEXT:  ; %bb.1:
1954; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1955; GFX8-NEXT:    s_mov_b32 s12, s6
1956; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
1957; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1958; GFX8-NEXT:    v_mul_hi_u32 v1, s0, v1
1959; GFX8-NEXT:    s_mov_b32 s13, s7
1960; GFX8-NEXT:    s_mul_i32 s7, s1, s6
1961; GFX8-NEXT:    s_mul_i32 s6, s0, s6
1962; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s7, v1
1963; GFX8-NEXT:    s_mov_b32 s15, 0xf000
1964; GFX8-NEXT:    s_mov_b32 s14, -1
1965; GFX8-NEXT:    v_mov_b32_e32 v1, s6
1966; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1967; GFX8-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc
1968; GFX8-NEXT:    s_waitcnt vmcnt(0)
1969; GFX8-NEXT:    buffer_wbinvl1_vol
1970; GFX8-NEXT:  BB10_2:
1971; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1972; GFX8-NEXT:    v_readfirstlane_b32 s2, v1
1973; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1974; GFX8-NEXT:    v_mul_lo_u32 v1, s1, v0
1975; GFX8-NEXT:    v_mul_hi_u32 v3, s0, v0
1976; GFX8-NEXT:    v_mul_lo_u32 v0, s0, v0
1977; GFX8-NEXT:    v_readfirstlane_b32 s1, v2
1978; GFX8-NEXT:    v_mov_b32_e32 v2, s1
1979; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1980; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
1981; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1982; GFX8-NEXT:    s_mov_b32 s6, -1
1983; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
1984; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1985; GFX8-NEXT:    s_endpgm
1986;
1987; GFX9-LABEL: sub_i64_uniform:
1988; GFX9:       ; %bb.0: ; %entry
1989; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1990; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1991; GFX9-NEXT:    s_mov_b64 s[8:9], exec
1992; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1993; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
1994; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1995; GFX9-NEXT:    ; implicit-def: $vgpr1_vgpr2
1996; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1997; GFX9-NEXT:    s_cbranch_execz BB10_2
1998; GFX9-NEXT:  ; %bb.1:
1999; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2000; GFX9-NEXT:    s_mov_b32 s12, s6
2001; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
2002; GFX9-NEXT:    s_mov_b32 s13, s7
2003; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2004; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2005; GFX9-NEXT:    s_add_i32 s8, s8, s7
2006; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2007; GFX9-NEXT:    s_mov_b32 s15, 0xf000
2008; GFX9-NEXT:    s_mov_b32 s14, -1
2009; GFX9-NEXT:    v_mov_b32_e32 v1, s6
2010; GFX9-NEXT:    v_mov_b32_e32 v2, s8
2011; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2012; GFX9-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc
2013; GFX9-NEXT:    s_waitcnt vmcnt(0)
2014; GFX9-NEXT:    buffer_wbinvl1_vol
2015; GFX9-NEXT:  BB10_2:
2016; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
2017; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2018; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
2019; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
2020; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v0
2021; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
2022; GFX9-NEXT:    v_readfirstlane_b32 s1, v2
2023; GFX9-NEXT:    v_add_u32_e32 v1, v4, v3
2024; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2025; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v0
2026; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2027; GFX9-NEXT:    s_mov_b32 s6, -1
2028; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2029; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2030; GFX9-NEXT:    s_endpgm
2031;
2032; GCN64-LABEL: sub_i64_uniform:
2033; GCN64:       ; %bb.0: ; %entry
2034; GCN64-NEXT:    s_clause 0x1
2035; GCN64-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2036; GCN64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2037; GCN64-NEXT:    s_mov_b64 s[8:9], exec
2038; GCN64-NEXT:    ; implicit-def: $vgpr1_vgpr2
2039; GCN64-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
2040; GCN64-NEXT:    v_mbcnt_hi_u32_b32_e64 v0, s9, v0
2041; GCN64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2042; GCN64-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2043; GCN64-NEXT:    s_cbranch_execz BB10_2
2044; GCN64-NEXT:  ; %bb.1:
2045; GCN64-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
2046; GCN64-NEXT:    s_mov_b32 s11, 0x31016000
2047; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
2048; GCN64-NEXT:    s_mul_i32 s9, s3, s8
2049; GCN64-NEXT:    s_mul_hi_u32 s10, s2, s8
2050; GCN64-NEXT:    s_mul_i32 s8, s2, s8
2051; GCN64-NEXT:    s_add_i32 s10, s10, s9
2052; GCN64-NEXT:    v_mov_b32_e32 v1, s8
2053; GCN64-NEXT:    v_mov_b32_e32 v2, s10
2054; GCN64-NEXT:    s_mov_b32 s10, -1
2055; GCN64-NEXT:    s_mov_b32 s8, s6
2056; GCN64-NEXT:    s_mov_b32 s9, s7
2057; GCN64-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2058; GCN64-NEXT:    s_waitcnt_vscnt null, 0x0
2059; GCN64-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
2060; GCN64-NEXT:    s_waitcnt vmcnt(0)
2061; GCN64-NEXT:    buffer_gl0_inv
2062; GCN64-NEXT:    buffer_gl1_inv
2063; GCN64-NEXT:  BB10_2:
2064; GCN64-NEXT:    s_waitcnt_depctr 0xffe3
2065; GCN64-NEXT:    s_or_b64 exec, exec, s[0:1]
2066; GCN64-NEXT:    s_waitcnt lgkmcnt(0)
2067; GCN64-NEXT:    v_mul_lo_u32 v3, s3, v0
2068; GCN64-NEXT:    v_mul_hi_u32 v4, s2, v0
2069; GCN64-NEXT:    v_mul_lo_u32 v0, s2, v0
2070; GCN64-NEXT:    v_readfirstlane_b32 s0, v1
2071; GCN64-NEXT:    v_readfirstlane_b32 s1, v2
2072; GCN64-NEXT:    s_mov_b32 s7, 0x31016000
2073; GCN64-NEXT:    s_mov_b32 s6, -1
2074; GCN64-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2075; GCN64-NEXT:    v_sub_co_u32_e64 v0, vcc, s0, v0
2076; GCN64-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
2077; GCN64-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2078; GCN64-NEXT:    s_endpgm
2079;
2080; GCN32-LABEL: sub_i64_uniform:
2081; GCN32:       ; %bb.0: ; %entry
2082; GCN32-NEXT:    s_clause 0x1
2083; GCN32-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2084; GCN32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2085; GCN32-NEXT:    s_mov_b32 s8, exec_lo
2086; GCN32-NEXT:    ; implicit-def: $vgpr1_vgpr2
2087; GCN32-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
2088; GCN32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2089; GCN32-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2090; GCN32-NEXT:    s_cbranch_execz BB10_2
2091; GCN32-NEXT:  ; %bb.1:
2092; GCN32-NEXT:    s_bcnt1_i32_b32 s1, s8
2093; GCN32-NEXT:    s_mov_b32 s11, 0x31016000
2094; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
2095; GCN32-NEXT:    s_mul_i32 s8, s3, s1
2096; GCN32-NEXT:    s_mul_hi_u32 s9, s2, s1
2097; GCN32-NEXT:    s_mul_i32 s1, s2, s1
2098; GCN32-NEXT:    s_add_i32 s9, s9, s8
2099; GCN32-NEXT:    v_mov_b32_e32 v1, s1
2100; GCN32-NEXT:    v_mov_b32_e32 v2, s9
2101; GCN32-NEXT:    s_mov_b32 s10, -1
2102; GCN32-NEXT:    s_mov_b32 s8, s6
2103; GCN32-NEXT:    s_mov_b32 s9, s7
2104; GCN32-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2105; GCN32-NEXT:    s_waitcnt_vscnt null, 0x0
2106; GCN32-NEXT:    buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc
2107; GCN32-NEXT:    s_waitcnt vmcnt(0)
2108; GCN32-NEXT:    buffer_gl0_inv
2109; GCN32-NEXT:    buffer_gl1_inv
2110; GCN32-NEXT:  BB10_2:
2111; GCN32-NEXT:    s_waitcnt_depctr 0xffe3
2112; GCN32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2113; GCN32-NEXT:    s_waitcnt lgkmcnt(0)
2114; GCN32-NEXT:    v_mul_lo_u32 v3, s3, v0
2115; GCN32-NEXT:    v_mul_hi_u32 v4, s2, v0
2116; GCN32-NEXT:    v_mul_lo_u32 v0, s2, v0
2117; GCN32-NEXT:    v_readfirstlane_b32 s0, v1
2118; GCN32-NEXT:    v_readfirstlane_b32 s1, v2
2119; GCN32-NEXT:    s_mov_b32 s7, 0x31016000
2120; GCN32-NEXT:    s_mov_b32 s6, -1
2121; GCN32-NEXT:    v_add_nc_u32_e32 v1, v4, v3
2122; GCN32-NEXT:    v_sub_co_u32_e64 v0, vcc_lo, s0, v0
2123; GCN32-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2124; GCN32-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2125; GCN32-NEXT:    s_endpgm
2126entry:
  ; The subtrahend is the third kernel argument, passed straight through.
2127  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
2128  store i64 %old, i64 addrspace(1)* %out
2129  ret void
2130}
2131
; sub_i64_varying - atomicrmw sub on a global i64 whose operand is the
; workitem id zero-extended to i64. The value returned by the atomic is stored
; to %out.
; Unlike the i32 varying case, the autogenerated check lines below expect the
; same simple shape for all three prefix groups (GFX7LESS, GFX89, GFX10) - v1
; is set to 0 as the high half, a single buffer_atomic_sub_x2 is issued on
; v[0:1], and there is no mbcnt, saveexec or DPP sequence.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
2132define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
2133; GFX7LESS-LABEL: sub_i64_varying:
2134; GFX7LESS:       ; %bb.0: ; %entry
2135; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2136; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2137; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2138; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2139; GFX7LESS-NEXT:    s_mov_b32 s10, s6
2140; GFX7LESS-NEXT:    s_mov_b32 s11, s7
2141; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2142; GFX7LESS-NEXT:    s_mov_b32 s8, s2
2143; GFX7LESS-NEXT:    s_mov_b32 s9, s3
2144; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2145; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2146; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
2147; GFX7LESS-NEXT:    buffer_wbinvl1
2148; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2149; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2150; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2151; GFX7LESS-NEXT:    s_endpgm
2152;
2153; GFX89-LABEL: sub_i64_varying:
2154; GFX89:       ; %bb.0: ; %entry
2155; GFX89-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2156; GFX89-NEXT:    s_mov_b32 s3, 0xf000
2157; GFX89-NEXT:    s_mov_b32 s2, -1
2158; GFX89-NEXT:    v_mov_b32_e32 v1, 0
2159; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
2160; GFX89-NEXT:    s_mov_b32 s0, s4
2161; GFX89-NEXT:    s_mov_b32 s1, s5
2162; GFX89-NEXT:    s_mov_b32 s4, s6
2163; GFX89-NEXT:    s_mov_b32 s5, s7
2164; GFX89-NEXT:    s_mov_b32 s6, s2
2165; GFX89-NEXT:    s_mov_b32 s7, s3
2166; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2167; GFX89-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc
2168; GFX89-NEXT:    s_waitcnt vmcnt(0)
2169; GFX89-NEXT:    buffer_wbinvl1_vol
2170; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2171; GFX89-NEXT:    s_endpgm
2172;
2173; GFX10-LABEL: sub_i64_varying:
2174; GFX10:       ; %bb.0: ; %entry
2175; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2176; GFX10-NEXT:    v_mov_b32_e32 v1, 0
2177; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
2178; GFX10-NEXT:    s_mov_b32 s6, -1
2179; GFX10-NEXT:    s_mov_b32 s11, s7
2180; GFX10-NEXT:    s_mov_b32 s10, s6
2181; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2182; GFX10-NEXT:    s_mov_b32 s8, s2
2183; GFX10-NEXT:    s_mov_b32 s9, s3
2184; GFX10-NEXT:    s_mov_b32 s4, s0
2185; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2186; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2187; GFX10-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2188; GFX10-NEXT:    s_waitcnt vmcnt(0)
2189; GFX10-NEXT:    buffer_gl0_inv
2190; GFX10-NEXT:    buffer_gl1_inv
2191; GFX10-NEXT:    s_mov_b32 s5, s1
2192; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2193; GFX10-NEXT:    s_endpgm
2194entry:
  ; The i32 workitem id is widened with zext, so the high 32 bits of the
  ; subtrahend are zero (the "v_mov_b32_e32 v1, 0" in the check lines).
2195  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2196  %zext = zext i32 %lane to i64
2197  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
2198  store i64 %old, i64 addrspace(1)* %out
2199  ret void
2200}
2201