; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1164 %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX1132 %s

; Workitem-id intrinsic declaration. The constant and uniform kernels in this
; file do not call it; presumably the varying-operand kernels do (their IR
; bodies are not all visible from here -- confirm before removing).
declare i32 @llvm.amdgcn.workitem.id.x()

; Show what the atomic optimization pass will do for global pointers.

; add_i32_constant: acq_rel atomicrmw add of the constant 5 to %inout; the value
; returned by the atomic is stored to %out.
;
; What the autogenerated checks below pin down, on every target:
;   * v_mbcnt_lo (plus v_mbcnt_hi on the wave64 runs) over a copy of exec gives
;     each lane its index among the active lanes;
;   * only the lane whose index is 0 enters the block that issues the atomic
;     (v_cmp_eq + s_and_saveexec, or v_cmpx on the gfx1100 runs);
;   * the operand of that single atomic is s_bcnt1(exec copy) * 5;
;   * after exec is restored, v_readfirstlane broadcasts the atomic's result
;     and v_mad_u32_u24 adds lane_index * 5 to form the per-lane old value.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
; GFX7LESS-LABEL: add_i32_constant:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB0_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
; GFX7LESS-NEXT:    s_mov_b32 s10, -1
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s8, s2
; GFX7LESS-NEXT:    s_mov_b32 s9, s3
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:  .LBB0_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s2, -1
; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
; GFX7LESS-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX89-LABEL: add_i32_constant:
; GFX89:       ; %bb.0: ; %entry
; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX89-NEXT:    s_mov_b64 s[6:7], exec
; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX89-NEXT:    ; implicit-def: $vgpr1
; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX89-NEXT:    s_cbranch_execz .LBB0_2
; GFX89-NEXT:  ; %bb.1:
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s8, s2
; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
; GFX89-NEXT:    s_mul_i32 s2, s2, 5
; GFX89-NEXT:    s_mov_b32 s11, 0xf000
; GFX89-NEXT:    s_mov_b32 s10, -1
; GFX89-NEXT:    s_mov_b32 s9, s3
; GFX89-NEXT:    v_mov_b32_e32 v1, s2
; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX89-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX89-NEXT:    s_waitcnt vmcnt(0)
; GFX89-NEXT:    buffer_wbinvl1_vol
; GFX89-NEXT:  .LBB0_2:
; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
; GFX89-NEXT:    s_mov_b32 s3, 0xf000
; GFX89-NEXT:    s_mov_b32 s2, -1
; GFX89-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
; GFX89-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX89-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_constant:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB0_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
; GFX1064-NEXT:    s_mov_b32 s10, -1
; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_mov_b32 s8, s2
; GFX1064-NEXT:    s_mov_b32 s9, s3
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:    buffer_gl1_inv
; GFX1064-NEXT:  .LBB0_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1064-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1064-NEXT:    s_mov_b32 s2, -1
; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_constant:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB0_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
; GFX1032-NEXT:    s_mov_b32 s10, -1
; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_mov_b32 s8, s2
; GFX1032-NEXT:    s_mov_b32 s9, s3
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  .LBB0_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1032-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1032-NEXT:    s_mov_b32 s2, -1
; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: add_i32_constant:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
; GFX1164-NEXT:    ; implicit-def: $vgpr1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT:    s_cbranch_execz .LBB0_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
; GFX1164-NEXT:    s_mov_b32 s10, -1
; GFX1164-NEXT:    v_mov_b32_e32 v1, s6
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    s_mov_b32 s8, s2
; GFX1164-NEXT:    s_mov_b32 s9, s3
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1164-NEXT:    s_waitcnt vmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:    buffer_gl1_inv
; GFX1164-NEXT:  .LBB0_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1164-NEXT:    s_mov_b32 s2, -1
; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: add_i32_constant:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
; GFX1132-NEXT:    ; implicit-def: $vgpr1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT:    s_cbranch_execz .LBB0_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
; GFX1132-NEXT:    s_mov_b32 s10, -1
; GFX1132-NEXT:    v_mov_b32_e32 v1, s5
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    s_mov_b32 s8, s2
; GFX1132-NEXT:    s_mov_b32 s9, s3
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT:    s_waitcnt vmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:    buffer_gl1_inv
; GFX1132-NEXT:  .LBB0_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_mad_u32_u24 v0, v0, 5, s2
; GFX1132-NEXT:    s_mov_b32 s2, -1
; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT:    s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

; add_i32_uniform: acq_rel atomicrmw add of the kernel argument %additive to
; %inout; the value returned by the atomic is stored to %out. In every check
; block %additive is fetched with a scalar load into an SGPR.
;
; What the autogenerated checks below pin down, on every target:
;   * the same single-lane structure as the constant case: v_mbcnt over a copy
;     of exec, and only the lane with index 0 issues the atomic;
;   * the operand of that atomic is %additive * s_bcnt1(exec copy), computed
;     with s_mul_i32;
;   * after exec is restored, the per-lane old value is
;     readfirstlane(result) + %additive * lane_index, formed with
;     v_mul_lo_u32 followed by an add on the pre-gfx10 runs and with a single
;     v_mad_u64_u32 on the gfx1010 / gfx1100 runs.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) {
; GFX7LESS-LABEL: add_i32_uniform:
; GFX7LESS:       ; %bb.0: ; %entry
; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7LESS-NEXT:    s_load_dword s8, s[0:1], 0xd
; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX7LESS-NEXT:    s_cbranch_execz .LBB1_2
; GFX7LESS-NEXT:  ; %bb.1:
; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mul_i32 s2, s8, s2
; GFX7LESS-NEXT:    s_mov_b32 s14, -1
; GFX7LESS-NEXT:    s_mov_b32 s12, s6
; GFX7LESS-NEXT:    s_mov_b32 s13, s7
; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7LESS-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
; GFX7LESS-NEXT:    buffer_wbinvl1
; GFX7LESS-NEXT:  .LBB1_2:
; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
; GFX7LESS-NEXT:    s_mov_b32 s6, -1
; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX7LESS-NEXT:    s_endpgm
;
; GFX8-LABEL: add_i32_uniform:
; GFX8:       ; %bb.0: ; %entry
; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x34
; GFX8-NEXT:    s_mov_b64 s[2:3], exec
; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT:    ; implicit-def: $vgpr1
; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX8-NEXT:    s_cbranch_execz .LBB1_2
; GFX8-NEXT:  ; %bb.1:
; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    s_mul_i32 s2, s8, s2
; GFX8-NEXT:    s_mov_b32 s15, 0xf000
; GFX8-NEXT:    s_mov_b32 s14, -1
; GFX8-NEXT:    s_mov_b32 s12, s6
; GFX8-NEXT:    s_mov_b32 s13, s7
; GFX8-NEXT:    v_mov_b32_e32 v1, s2
; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
; GFX8-NEXT:    s_waitcnt vmcnt(0)
; GFX8-NEXT:    buffer_wbinvl1_vol
; GFX8-NEXT:  .LBB1_2:
; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
; GFX8-NEXT:    s_mov_b32 s7, 0xf000
; GFX8-NEXT:    s_mov_b32 s6, -1
; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX8-NEXT:    s_endpgm
;
; GFX9-LABEL: add_i32_uniform:
; GFX9:       ; %bb.0: ; %entry
; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x34
; GFX9-NEXT:    s_mov_b64 s[2:3], exec
; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT:    ; implicit-def: $vgpr1
; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX9-NEXT:    s_cbranch_execz .LBB1_2
; GFX9-NEXT:  ; %bb.1:
; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_mul_i32 s2, s8, s2
; GFX9-NEXT:    s_mov_b32 s15, 0xf000
; GFX9-NEXT:    s_mov_b32 s14, -1
; GFX9-NEXT:    s_mov_b32 s12, s6
; GFX9-NEXT:    s_mov_b32 s13, s7
; GFX9-NEXT:    v_mov_b32_e32 v1, s2
; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
; GFX9-NEXT:    s_waitcnt vmcnt(0)
; GFX9-NEXT:    buffer_wbinvl1_vol
; GFX9-NEXT:  .LBB1_2:
; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
; GFX9-NEXT:    s_mov_b32 s7, 0xf000
; GFX9-NEXT:    s_mov_b32 s6, -1
; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT:    s_endpgm
;
; GFX1064-LABEL: add_i32_uniform:
; GFX1064:       ; %bb.0: ; %entry
; GFX1064-NEXT:    s_clause 0x1
; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1064-NEXT:    s_load_dword s8, s[0:1], 0x34
; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
; GFX1064-NEXT:    ; implicit-def: $vgpr1
; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GFX1064-NEXT:    s_cbranch_execz .LBB1_2
; GFX1064-NEXT:  ; %bb.1:
; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_mul_i32 s2, s8, s2
; GFX1064-NEXT:    s_mov_b32 s14, -1
; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
; GFX1064-NEXT:    s_mov_b32 s12, s6
; GFX1064-NEXT:    s_mov_b32 s13, s7
; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
; GFX1064-NEXT:    s_waitcnt vmcnt(0)
; GFX1064-NEXT:    buffer_gl0_inv
; GFX1064-NEXT:    buffer_gl1_inv
; GFX1064-NEXT:  .LBB1_2:
; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1064-NEXT:    s_mov_b32 s6, -1
; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX1064-NEXT:    s_endpgm
;
; GFX1032-LABEL: add_i32_uniform:
; GFX1032:       ; %bb.0: ; %entry
; GFX1032-NEXT:    s_clause 0x1
; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x34
; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
; GFX1032-NEXT:    ; implicit-def: $vgpr1
; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
; GFX1032-NEXT:    s_cbranch_execz .LBB1_2
; GFX1032-NEXT:  ; %bb.1:
; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
; GFX1032-NEXT:    s_mov_b32 s10, -1
; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
; GFX1032-NEXT:    s_mov_b32 s8, s6
; GFX1032-NEXT:    s_mov_b32 s9, s7
; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
; GFX1032-NEXT:    s_waitcnt vmcnt(0)
; GFX1032-NEXT:    buffer_gl0_inv
; GFX1032-NEXT:    buffer_gl1_inv
; GFX1032-NEXT:  .LBB1_2:
; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1032-NEXT:    s_mov_b32 s6, -1
; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v0, s[0:1]
; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GFX1032-NEXT:    s_endpgm
;
; GFX1164-LABEL: add_i32_uniform:
; GFX1164:       ; %bb.0: ; %entry
; GFX1164-NEXT:    s_clause 0x1
; GFX1164-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX1164-NEXT:    s_load_b32 s8, s[0:1], 0x34
; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1164-NEXT:    ; implicit-def: $vgpr1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1164-NEXT:    s_cbranch_execz .LBB1_2
; GFX1164-NEXT:  ; %bb.1:
; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
; GFX1164-NEXT:    s_mov_b32 s15, 0x31016000
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    s_mul_i32 s2, s8, s2
; GFX1164-NEXT:    s_mov_b32 s14, -1
; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
; GFX1164-NEXT:    s_mov_b32 s12, s6
; GFX1164-NEXT:    s_mov_b32 s13, s7
; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[12:15], 0 glc
; GFX1164-NEXT:    s_waitcnt vmcnt(0)
; GFX1164-NEXT:    buffer_gl0_inv
; GFX1164-NEXT:    buffer_gl1_inv
; GFX1164-NEXT:  .LBB1_2:
; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1164-NEXT:    s_mov_b32 s6, -1
; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1164-NEXT:    v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
; GFX1164-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT:    s_endpgm
;
; GFX1132-LABEL: add_i32_uniform:
; GFX1132:       ; %bb.0: ; %entry
; GFX1132-NEXT:    s_clause 0x1
; GFX1132-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x34
; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX1132-NEXT:    ; implicit-def: $vgpr1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
; GFX1132-NEXT:    s_cbranch_execz .LBB1_2
; GFX1132-NEXT:  ; %bb.1:
; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
; GFX1132-NEXT:    s_mov_b32 s10, -1
; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
; GFX1132-NEXT:    s_mov_b32 s8, s6
; GFX1132-NEXT:    s_mov_b32 s9, s7
; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
; GFX1132-NEXT:    s_waitcnt vmcnt(0)
; GFX1132-NEXT:    buffer_gl0_inv
; GFX1132-NEXT:    buffer_gl1_inv
; GFX1132-NEXT:  .LBB1_2:
; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
; GFX1132-NEXT:    s_mov_b32 s6, -1
; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX1132-NEXT:    v_mad_u64_u32 v[1:2], null, s0, v0, s[2:3]
; GFX1132-NEXT:    buffer_store_b32 v1, off, s[4:7], 0
; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT:    s_endpgm
entry:
  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel
  store i32 %old, i32 addrspace(1)* %out
  ret void
}

497define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
498; GFX7LESS-LABEL: add_i32_varying:
499; GFX7LESS:       ; %bb.0: ; %entry
500; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
501; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
502; GFX7LESS-NEXT:    s_mov_b32 s6, -1
503; GFX7LESS-NEXT:    s_mov_b32 s10, s6
504; GFX7LESS-NEXT:    s_mov_b32 s11, s7
505; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
506; GFX7LESS-NEXT:    s_mov_b32 s8, s2
507; GFX7LESS-NEXT:    s_mov_b32 s9, s3
508; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
509; GFX7LESS-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
510; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
511; GFX7LESS-NEXT:    buffer_wbinvl1
512; GFX7LESS-NEXT:    s_mov_b32 s4, s0
513; GFX7LESS-NEXT:    s_mov_b32 s5, s1
514; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
515; GFX7LESS-NEXT:    s_endpgm
516;
517; GFX8-LABEL: add_i32_varying:
518; GFX8:       ; %bb.0: ; %entry
519; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
520; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
521; GFX8-NEXT:    v_mov_b32_e32 v1, 0
522; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
523; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
524; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
525; GFX8-NEXT:    v_mov_b32_e32 v2, v0
526; GFX8-NEXT:    s_not_b64 exec, exec
527; GFX8-NEXT:    v_mov_b32_e32 v2, 0
528; GFX8-NEXT:    s_not_b64 exec, exec
529; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
530; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
531; GFX8-NEXT:    s_nop 1
532; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
533; GFX8-NEXT:    s_nop 1
534; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
535; GFX8-NEXT:    s_nop 1
536; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
537; GFX8-NEXT:    s_nop 1
538; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
539; GFX8-NEXT:    s_nop 1
540; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
541; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
542; GFX8-NEXT:    s_nop 0
543; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
544; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
545; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
546; GFX8-NEXT:    ; implicit-def: $vgpr0
547; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
548; GFX8-NEXT:    s_cbranch_execz .LBB2_2
549; GFX8-NEXT:  ; %bb.1:
550; GFX8-NEXT:    s_mov_b32 s11, 0xf000
551; GFX8-NEXT:    s_mov_b32 s10, -1
552; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
553; GFX8-NEXT:    s_mov_b32 s8, s2
554; GFX8-NEXT:    s_mov_b32 s9, s3
555; GFX8-NEXT:    v_mov_b32_e32 v0, s6
556; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
557; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
558; GFX8-NEXT:    s_waitcnt vmcnt(0)
559; GFX8-NEXT:    buffer_wbinvl1_vol
560; GFX8-NEXT:  .LBB2_2:
561; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
562; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
563; GFX8-NEXT:    v_mov_b32_e32 v0, v1
564; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX8-NEXT:    s_mov_b32 s3, 0xf000
566; GFX8-NEXT:    s_mov_b32 s2, -1
567; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v0
568; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
569; GFX8-NEXT:    s_endpgm
570;
571; GFX9-LABEL: add_i32_varying:
572; GFX9:       ; %bb.0: ; %entry
573; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
574; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
575; GFX9-NEXT:    v_mov_b32_e32 v1, 0
576; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
577; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
578; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
579; GFX9-NEXT:    v_mov_b32_e32 v2, v0
580; GFX9-NEXT:    s_not_b64 exec, exec
581; GFX9-NEXT:    v_mov_b32_e32 v2, 0
582; GFX9-NEXT:    s_not_b64 exec, exec
583; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
584; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
585; GFX9-NEXT:    s_nop 1
586; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
587; GFX9-NEXT:    s_nop 1
588; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
589; GFX9-NEXT:    s_nop 1
590; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
591; GFX9-NEXT:    s_nop 1
592; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
593; GFX9-NEXT:    s_nop 1
594; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
595; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
596; GFX9-NEXT:    s_nop 0
597; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
598; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
599; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
600; GFX9-NEXT:    ; implicit-def: $vgpr0
601; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
602; GFX9-NEXT:    s_cbranch_execz .LBB2_2
603; GFX9-NEXT:  ; %bb.1:
604; GFX9-NEXT:    s_mov_b32 s11, 0xf000
605; GFX9-NEXT:    s_mov_b32 s10, -1
606; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX9-NEXT:    s_mov_b32 s8, s2
608; GFX9-NEXT:    s_mov_b32 s9, s3
609; GFX9-NEXT:    v_mov_b32_e32 v0, s6
610; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
611; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
612; GFX9-NEXT:    s_waitcnt vmcnt(0)
613; GFX9-NEXT:    buffer_wbinvl1_vol
614; GFX9-NEXT:  .LBB2_2:
615; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
616; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
617; GFX9-NEXT:    v_mov_b32_e32 v0, v1
618; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX9-NEXT:    s_mov_b32 s3, 0xf000
620; GFX9-NEXT:    s_mov_b32 s2, -1
621; GFX9-NEXT:    v_add_u32_e32 v0, s4, v0
622; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
623; GFX9-NEXT:    s_endpgm
624;
625; GFX1064-LABEL: add_i32_varying:
626; GFX1064:       ; %bb.0: ; %entry
627; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
628; GFX1064-NEXT:    s_not_b64 exec, exec
629; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
630; GFX1064-NEXT:    s_not_b64 exec, exec
631; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
632; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
633; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
634; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
635; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
636; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
637; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
638; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
639; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
640; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
641; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
642; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
643; GFX1064-NEXT:    v_readlane_b32 s6, v1, 15
644; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
645; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
646; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
647; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
648; GFX1064-NEXT:    v_readlane_b32 s7, v1, 31
649; GFX1064-NEXT:    v_writelane_b32 v3, s6, 16
650; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
651; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
652; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
653; GFX1064-NEXT:    v_readlane_b32 s8, v1, 47
654; GFX1064-NEXT:    v_readlane_b32 s9, v1, 63
655; GFX1064-NEXT:    v_writelane_b32 v3, s7, 32
656; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
657; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
658; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
659; GFX1064-NEXT:    s_mov_b32 s4, s9
660; GFX1064-NEXT:    v_writelane_b32 v3, s8, 48
661; GFX1064-NEXT:    s_mov_b64 exec, s[6:7]
662; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
663; GFX1064-NEXT:    s_mov_b32 s6, -1
664; GFX1064-NEXT:    ; implicit-def: $vgpr0
665; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
666; GFX1064-NEXT:    s_cbranch_execz .LBB2_2
667; GFX1064-NEXT:  ; %bb.1:
668; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
669; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
670; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
671; GFX1064-NEXT:    s_mov_b32 s4, s2
672; GFX1064-NEXT:    s_mov_b32 s5, s3
673; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
674; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
675; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
676; GFX1064-NEXT:    s_waitcnt vmcnt(0)
677; GFX1064-NEXT:    buffer_gl0_inv
678; GFX1064-NEXT:    buffer_gl1_inv
679; GFX1064-NEXT:  .LBB2_2:
680; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
681; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
682; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
683; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
684; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
685; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
686; GFX1064-NEXT:    v_add_nc_u32_e32 v0, s2, v0
687; GFX1064-NEXT:    s_mov_b32 s2, s6
688; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
689; GFX1064-NEXT:    s_endpgm
690;
691; GFX1032-LABEL: add_i32_varying:
692; GFX1032:       ; %bb.0: ; %entry
693; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
694; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
695; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
696; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
697; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
698; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
699; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
700; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
701; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
702; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
703; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
704; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
705; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
706; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
707; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
708; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
709; GFX1032-NEXT:    v_readlane_b32 s5, v1, 15
710; GFX1032-NEXT:    v_readlane_b32 s6, v1, 31
711; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
712; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
713; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
714; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
715; GFX1032-NEXT:    v_writelane_b32 v3, s5, 16
716; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
717; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
718; GFX1032-NEXT:    s_mov_b32 s4, s6
719; GFX1032-NEXT:    s_mov_b32 s6, -1
720; GFX1032-NEXT:    ; implicit-def: $vgpr0
721; GFX1032-NEXT:    s_and_saveexec_b32 s8, vcc_lo
722; GFX1032-NEXT:    s_cbranch_execz .LBB2_2
723; GFX1032-NEXT:  ; %bb.1:
724; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
725; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
726; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
727; GFX1032-NEXT:    s_mov_b32 s4, s2
728; GFX1032-NEXT:    s_mov_b32 s5, s3
729; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
730; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
731; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
732; GFX1032-NEXT:    s_waitcnt vmcnt(0)
733; GFX1032-NEXT:    buffer_gl0_inv
734; GFX1032-NEXT:    buffer_gl1_inv
735; GFX1032-NEXT:  .LBB2_2:
736; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
737; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
738; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
739; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
740; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
741; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
742; GFX1032-NEXT:    v_add_nc_u32_e32 v0, s2, v0
743; GFX1032-NEXT:    s_mov_b32 s2, s6
744; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
745; GFX1032-NEXT:    s_endpgm
746;
747; GFX1164-LABEL: add_i32_varying:
748; GFX1164:       ; %bb.0: ; %entry
749; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
750; GFX1164-NEXT:    s_not_b64 exec, exec
751; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
752; GFX1164-NEXT:    s_not_b64 exec, exec
753; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
754; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
755; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
756; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
757; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
758; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
759; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
760; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
761; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
762; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
763; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
764; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
765; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
766; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
767; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
768; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
769; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
770; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
771; GFX1164-NEXT:    v_readlane_b32 s6, v1, 15
772; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
773; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
774; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
775; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
776; GFX1164-NEXT:    v_readlane_b32 s7, v1, 31
777; GFX1164-NEXT:    v_writelane_b32 v3, s6, 16
778; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
779; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
780; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
781; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
782; GFX1164-NEXT:    v_readlane_b32 s8, v1, 47
783; GFX1164-NEXT:    v_readlane_b32 s9, v1, 63
784; GFX1164-NEXT:    v_writelane_b32 v3, s7, 32
785; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
786; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
787; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
788; GFX1164-NEXT:    s_or_saveexec_b64 s[6:7], -1
789; GFX1164-NEXT:    s_mov_b32 s4, s9
790; GFX1164-NEXT:    v_writelane_b32 v3, s8, 48
791; GFX1164-NEXT:    s_mov_b64 exec, s[6:7]
792; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
793; GFX1164-NEXT:    s_mov_b32 s6, -1
794; GFX1164-NEXT:    ; implicit-def: $vgpr0
795; GFX1164-NEXT:    s_and_saveexec_b64 s[8:9], vcc
796; GFX1164-NEXT:    s_cbranch_execz .LBB2_2
797; GFX1164-NEXT:  ; %bb.1:
798; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
799; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
800; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
801; GFX1164-NEXT:    s_mov_b32 s4, s2
802; GFX1164-NEXT:    s_mov_b32 s5, s3
803; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
804; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
805; GFX1164-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
806; GFX1164-NEXT:    s_waitcnt vmcnt(0)
807; GFX1164-NEXT:    buffer_gl0_inv
808; GFX1164-NEXT:    buffer_gl1_inv
809; GFX1164-NEXT:  .LBB2_2:
810; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
811; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
812; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
813; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
814; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
815; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
816; GFX1164-NEXT:    v_add_nc_u32_e32 v0, s2, v0
817; GFX1164-NEXT:    s_mov_b32 s2, s6
818; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
819; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
820; GFX1164-NEXT:    s_endpgm
821;
822; GFX1132-LABEL: add_i32_varying:
823; GFX1132:       ; %bb.0: ; %entry
824; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
825; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
826; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
827; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
828; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
829; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
830; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
831; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
832; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
833; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
834; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
835; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
836; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
837; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
838; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
839; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
840; GFX1132-NEXT:    s_or_saveexec_b32 s4, -1
841; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
842; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
843; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
844; GFX1132-NEXT:    v_readlane_b32 s5, v1, 15
845; GFX1132-NEXT:    v_readlane_b32 s6, v1, 31
846; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
847; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
848; GFX1132-NEXT:    s_mov_b32 exec_lo, s4
849; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
850; GFX1132-NEXT:    s_or_saveexec_b32 s4, -1
851; GFX1132-NEXT:    v_writelane_b32 v3, s5, 16
852; GFX1132-NEXT:    s_mov_b32 exec_lo, s4
853; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
854; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
855; GFX1132-NEXT:    s_mov_b32 s4, s6
856; GFX1132-NEXT:    s_mov_b32 s6, -1
857; GFX1132-NEXT:    ; implicit-def: $vgpr0
858; GFX1132-NEXT:    s_and_saveexec_b32 s8, vcc_lo
859; GFX1132-NEXT:    s_cbranch_execz .LBB2_2
860; GFX1132-NEXT:  ; %bb.1:
861; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
862; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
863; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
864; GFX1132-NEXT:    s_mov_b32 s4, s2
865; GFX1132-NEXT:    s_mov_b32 s5, s3
866; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
867; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
868; GFX1132-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
869; GFX1132-NEXT:    s_waitcnt vmcnt(0)
870; GFX1132-NEXT:    buffer_gl0_inv
871; GFX1132-NEXT:    buffer_gl1_inv
872; GFX1132-NEXT:  .LBB2_2:
873; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
874; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
875; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
876; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
877; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
878; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
879; GFX1132-NEXT:    v_add_nc_u32_e32 v0, s2, v0
880; GFX1132-NEXT:    s_mov_b32 s2, s6
881; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
882; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
883; GFX1132-NEXT:    s_endpgm
884entry:
885  %lane = call i32 @llvm.amdgcn.workitem.id.x()
886  %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel
887  store i32 %old, i32 addrspace(1)* %out
888  ret void
889}
890
; add_i64_constant: atomically adds the i64 constant 5 to the value at %inout
; (acq_rel ordering) and stores the value returned by the atomic to %out.
;
; Shape of the checked code for every run line below:
;   * mbcnt is applied to a saved copy of exec, and only the lane whose result
;     is 0 enters %bb.1.
;   * %bb.1 issues one 64-bit buffer atomic add whose operand is
;     (number of set bits in the saved exec) * 5, with the high dword set to 0.
;   * After exec is restored, the returned value is read with
;     v_readfirstlane and each lane adds 5 * (its mbcnt result) to it before
;     the 64-bit store.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
891define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
892; GFX7LESS-LABEL: add_i64_constant:
893; GFX7LESS:       ; %bb.0: ; %entry
894; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
895; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
896; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
897; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
898; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
899; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
900; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
901; GFX7LESS-NEXT:    s_cbranch_execz .LBB3_2
902; GFX7LESS-NEXT:  ; %bb.1:
903; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
904; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
905; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
906; GFX7LESS-NEXT:    s_mov_b32 s10, -1
907; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
908; GFX7LESS-NEXT:    s_mov_b32 s8, s2
909; GFX7LESS-NEXT:    s_mov_b32 s9, s3
910; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
911; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
912; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
913; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
914; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
915; GFX7LESS-NEXT:    buffer_wbinvl1
916; GFX7LESS-NEXT:  .LBB3_2:
917; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
918; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
919; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
920; GFX7LESS-NEXT:    s_mov_b32 s2, -1
921; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
922; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
923; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
924; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
925; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
926; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
927; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
928; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
929; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
930; GFX7LESS-NEXT:    s_endpgm
931;
932; GFX89-LABEL: add_i64_constant:
933; GFX89:       ; %bb.0: ; %entry
934; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
935; GFX89-NEXT:    s_mov_b64 s[6:7], exec
936; GFX89-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
937; GFX89-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
938; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
939; GFX89-NEXT:    ; implicit-def: $vgpr0_vgpr1
940; GFX89-NEXT:    s_and_saveexec_b64 s[4:5], vcc
941; GFX89-NEXT:    s_cbranch_execz .LBB3_2
942; GFX89-NEXT:  ; %bb.1:
943; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
944; GFX89-NEXT:    s_mov_b32 s8, s2
945; GFX89-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
946; GFX89-NEXT:    s_mul_i32 s2, s2, 5
947; GFX89-NEXT:    s_mov_b32 s11, 0xf000
948; GFX89-NEXT:    s_mov_b32 s10, -1
949; GFX89-NEXT:    s_mov_b32 s9, s3
950; GFX89-NEXT:    v_mov_b32_e32 v0, s2
951; GFX89-NEXT:    v_mov_b32_e32 v1, 0
952; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
953; GFX89-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
954; GFX89-NEXT:    s_waitcnt vmcnt(0)
955; GFX89-NEXT:    buffer_wbinvl1_vol
956; GFX89-NEXT:  .LBB3_2:
957; GFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
958; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
959; GFX89-NEXT:    v_readfirstlane_b32 s2, v0
960; GFX89-NEXT:    v_readfirstlane_b32 s3, v1
961; GFX89-NEXT:    v_mov_b32_e32 v0, s2
962; GFX89-NEXT:    v_mov_b32_e32 v1, s3
963; GFX89-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
964; GFX89-NEXT:    s_mov_b32 s3, 0xf000
965; GFX89-NEXT:    s_mov_b32 s2, -1
966; GFX89-NEXT:    s_nop 2
967; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
968; GFX89-NEXT:    s_endpgm
969;
970; GFX1064-LABEL: add_i64_constant:
971; GFX1064:       ; %bb.0: ; %entry
972; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
973; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
974; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
975; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
976; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
977; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
978; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
979; GFX1064-NEXT:    s_cbranch_execz .LBB3_2
980; GFX1064-NEXT:  ; %bb.1:
981; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
982; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
983; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
984; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
985; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
986; GFX1064-NEXT:    s_mov_b32 s10, -1
987; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
988; GFX1064-NEXT:    s_mov_b32 s8, s2
989; GFX1064-NEXT:    s_mov_b32 s9, s3
990; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
991; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
992; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
993; GFX1064-NEXT:    s_waitcnt vmcnt(0)
994; GFX1064-NEXT:    buffer_gl0_inv
995; GFX1064-NEXT:    buffer_gl1_inv
996; GFX1064-NEXT:  .LBB3_2:
997; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
998; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
999; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1000; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
1001; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
1002; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
1003; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1004; GFX1064-NEXT:    s_mov_b32 s2, -1
1005; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1006; GFX1064-NEXT:    s_endpgm
1007;
1008; GFX1032-LABEL: add_i64_constant:
1009; GFX1032:       ; %bb.0: ; %entry
1010; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1011; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1012; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1013; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1014; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1015; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1016; GFX1032-NEXT:    s_cbranch_execz .LBB3_2
1017; GFX1032-NEXT:  ; %bb.1:
1018; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1019; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
1020; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
1021; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
1022; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
1023; GFX1032-NEXT:    s_mov_b32 s10, -1
1024; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX1032-NEXT:    s_mov_b32 s8, s2
1026; GFX1032-NEXT:    s_mov_b32 s9, s3
1027; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1028; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1029; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1030; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1031; GFX1032-NEXT:    buffer_gl0_inv
1032; GFX1032-NEXT:    buffer_gl1_inv
1033; GFX1032-NEXT:  .LBB3_2:
1034; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1035; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1036; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1037; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
1038; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
1039; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
1040; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1041; GFX1032-NEXT:    s_mov_b32 s2, -1
1042; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1043; GFX1032-NEXT:    s_endpgm
1044;
1045; GFX1164-LABEL: add_i64_constant:
1046; GFX1164:       ; %bb.0: ; %entry
1047; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1048; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1049; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1050; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1051; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1052; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
1053; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1054; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1055; GFX1164-NEXT:    s_cbranch_execz .LBB3_2
1056; GFX1164-NEXT:  ; %bb.1:
1057; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1058; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
1059; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
1060; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
1061; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
1062; GFX1164-NEXT:    s_mov_b32 s10, -1
1063; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1064; GFX1164-NEXT:    s_mov_b32 s8, s2
1065; GFX1164-NEXT:    s_mov_b32 s9, s3
1066; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1067; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1068; GFX1164-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
1069; GFX1164-NEXT:    s_waitcnt vmcnt(0)
1070; GFX1164-NEXT:    buffer_gl0_inv
1071; GFX1164-NEXT:    buffer_gl1_inv
1072; GFX1164-NEXT:  .LBB3_2:
1073; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1074; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1076; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1077; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1078; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1079; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1080; GFX1164-NEXT:    s_mov_b32 s2, -1
1081; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1082; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1083; GFX1164-NEXT:    s_endpgm
1084;
1085; GFX1132-LABEL: add_i64_constant:
1086; GFX1132:       ; %bb.0: ; %entry
1087; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1088; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
1089; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1090; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
1091; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1092; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1093; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1094; GFX1132-NEXT:    s_cbranch_execz .LBB3_2
1095; GFX1132-NEXT:  ; %bb.1:
1096; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
1097; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
1098; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
1099; GFX1132-NEXT:    s_mov_b32 s10, -1
1100; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
1101; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1102; GFX1132-NEXT:    s_mov_b32 s8, s2
1103; GFX1132-NEXT:    s_mov_b32 s9, s3
1104; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1105; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1106; GFX1132-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
1107; GFX1132-NEXT:    s_waitcnt vmcnt(0)
1108; GFX1132-NEXT:    buffer_gl0_inv
1109; GFX1132-NEXT:    buffer_gl1_inv
1110; GFX1132-NEXT:  .LBB3_2:
1111; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1112; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1114; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1115; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1116; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
1117; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1118; GFX1132-NEXT:    s_mov_b32 s2, -1
1119; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
1120; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1121; GFX1132-NEXT:    s_endpgm
1122entry:
  ; The addend is the literal 5, so it has the same value in every lane.
1123  %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel
  ; %old is the value the atomic returned, i.e. the memory contents before
  ; this add was applied.
1124  store i64 %old, i64 addrspace(1)* %out
1125  ret void
1126}
1127
1128define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) {
1129; GFX7LESS-LABEL: add_i64_uniform:
1130; GFX7LESS:       ; %bb.0: ; %entry
1131; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
1132; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1133; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
1134; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
1135; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s9, v0
1136; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1137; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
1138; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1139; GFX7LESS-NEXT:    s_cbranch_execz .LBB4_2
1140; GFX7LESS-NEXT:  ; %bb.1:
1141; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
1142; GFX7LESS-NEXT:    s_mov_b32 s14, -1
1143; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1144; GFX7LESS-NEXT:    s_mov_b32 s12, s6
1145; GFX7LESS-NEXT:    s_mov_b32 s13, s7
1146; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
1147; GFX7LESS-NEXT:    s_mul_i32 s7, s1, s6
1148; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1149; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s0, v0
1150; GFX7LESS-NEXT:    s_mul_i32 s6, s0, s6
1151; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
1152; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
1153; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1154; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
1155; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1156; GFX7LESS-NEXT:    buffer_wbinvl1
1157; GFX7LESS-NEXT:  .LBB4_2:
1158; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
1159; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1161; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1162; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
1163; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v1
1164; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
1165; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s1, v2
1166; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v2
1167; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s0, v2
1168; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
1169; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s3
1170; GFX7LESS-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
1171; GFX7LESS-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
1172; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1173; GFX7LESS-NEXT:    s_endpgm
1174;
1175; GFX8-LABEL: add_i64_uniform:
1176; GFX8:       ; %bb.0: ; %entry
1177; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1178; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
1179; GFX8-NEXT:    s_mov_b64 s[8:9], exec
1180; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1181; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
1182; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1183; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
1184; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
1185; GFX8-NEXT:    s_cbranch_execz .LBB4_2
1186; GFX8-NEXT:  ; %bb.1:
1187; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1188; GFX8-NEXT:    s_mov_b32 s12, s6
1189; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
1190; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1191; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
1192; GFX8-NEXT:    s_mul_i32 s6, s1, s6
1193; GFX8-NEXT:    s_mov_b32 s15, 0xf000
1194; GFX8-NEXT:    s_mov_b32 s14, -1
1195; GFX8-NEXT:    s_mov_b32 s13, s7
1196; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
1197; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1198; GFX8-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
1199; GFX8-NEXT:    s_waitcnt vmcnt(0)
1200; GFX8-NEXT:    buffer_wbinvl1_vol
1201; GFX8-NEXT:  .LBB4_2:
1202; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
1203; GFX8-NEXT:    v_readfirstlane_b32 s2, v0
1204; GFX8-NEXT:    v_readfirstlane_b32 s3, v1
1205; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1206; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1207; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1208; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v2
1209; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1]
1210; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1211; GFX8-NEXT:    s_mov_b32 s6, -1
1212; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v1
1213; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1214; GFX8-NEXT:    s_endpgm
1215;
1216; GFX9-LABEL: add_i64_uniform:
1217; GFX9:       ; %bb.0: ; %entry
1218; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1219; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1220; GFX9-NEXT:    s_mov_b64 s[8:9], exec
1221; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1222; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
1223; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1224; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
1225; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1226; GFX9-NEXT:    s_cbranch_execz .LBB4_2
1227; GFX9-NEXT:  ; %bb.1:
1228; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1229; GFX9-NEXT:    s_mov_b32 s12, s6
1230; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
1231; GFX9-NEXT:    s_mov_b32 s13, s7
1232; GFX9-NEXT:    s_mul_i32 s7, s3, s6
1233; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
1234; GFX9-NEXT:    s_add_i32 s8, s8, s7
1235; GFX9-NEXT:    s_mul_i32 s6, s2, s6
1236; GFX9-NEXT:    s_mov_b32 s15, 0xf000
1237; GFX9-NEXT:    s_mov_b32 s14, -1
1238; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1239; GFX9-NEXT:    v_mov_b32_e32 v1, s8
1240; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1241; GFX9-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
1242; GFX9-NEXT:    s_waitcnt vmcnt(0)
1243; GFX9-NEXT:    buffer_wbinvl1_vol
1244; GFX9-NEXT:  .LBB4_2:
1245; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1246; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1247; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
1248; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1249; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1250; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1251; GFX9-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
1252; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1253; GFX9-NEXT:    s_mov_b32 s6, -1
1254; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
1255; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1256; GFX9-NEXT:    s_endpgm
1257;
1258; GFX1064-LABEL: add_i64_uniform:
1259; GFX1064:       ; %bb.0: ; %entry
1260; GFX1064-NEXT:    s_clause 0x1
1261; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1262; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1263; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
1264; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1265; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
1266; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
1267; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1268; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1269; GFX1064-NEXT:    s_cbranch_execz .LBB4_2
1270; GFX1064-NEXT:  ; %bb.1:
1271; GFX1064-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
1272; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
1273; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1274; GFX1064-NEXT:    s_mul_i32 s9, s3, s8
1275; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s8
1276; GFX1064-NEXT:    s_mul_i32 s8, s2, s8
1277; GFX1064-NEXT:    s_add_i32 s10, s10, s9
1278; GFX1064-NEXT:    v_mov_b32_e32 v0, s8
1279; GFX1064-NEXT:    v_mov_b32_e32 v1, s10
1280; GFX1064-NEXT:    s_mov_b32 s10, -1
1281; GFX1064-NEXT:    s_mov_b32 s8, s6
1282; GFX1064-NEXT:    s_mov_b32 s9, s7
1283; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1284; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1285; GFX1064-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1286; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1287; GFX1064-NEXT:    buffer_gl0_inv
1288; GFX1064-NEXT:    buffer_gl1_inv
1289; GFX1064-NEXT:  .LBB4_2:
1290; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1291; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1292; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
1293; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
1294; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1295; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1296; GFX1064-NEXT:    s_mov_b32 s6, -1
1297; GFX1064-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, s[0:1]
1298; GFX1064-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s3, v2, v[1:2]
1299; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1300; GFX1064-NEXT:    s_endpgm
1301;
1302; GFX1032-LABEL: add_i64_uniform:
1303; GFX1032:       ; %bb.0: ; %entry
1304; GFX1032-NEXT:    s_clause 0x1
1305; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1306; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
1307; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
1308; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
1309; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s8, 0
1310; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
1311; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1312; GFX1032-NEXT:    s_cbranch_execz .LBB4_2
1313; GFX1032-NEXT:  ; %bb.1:
1314; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s8
1315; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
1316; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1317; GFX1032-NEXT:    s_mul_i32 s8, s3, s1
1318; GFX1032-NEXT:    s_mul_hi_u32 s9, s2, s1
1319; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1320; GFX1032-NEXT:    s_add_i32 s9, s9, s8
1321; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
1322; GFX1032-NEXT:    v_mov_b32_e32 v1, s9
1323; GFX1032-NEXT:    s_mov_b32 s10, -1
1324; GFX1032-NEXT:    s_mov_b32 s8, s6
1325; GFX1032-NEXT:    s_mov_b32 s9, s7
1326; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1327; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1328; GFX1032-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1329; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1330; GFX1032-NEXT:    buffer_gl0_inv
1331; GFX1032-NEXT:    buffer_gl1_inv
1332; GFX1032-NEXT:  .LBB4_2:
1333; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1334; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1335; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
1336; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
1337; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1338; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1339; GFX1032-NEXT:    s_mov_b32 s6, -1
1340; GFX1032-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v2, s[0:1]
1341; GFX1032-NEXT:    v_mad_u64_u32 v[1:2], s0, s3, v2, v[1:2]
1342; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1343; GFX1032-NEXT:    s_endpgm
1344;
1345; GFX1164-LABEL: add_i64_uniform:
1346; GFX1164:       ; %bb.0: ; %entry
1347; GFX1164-NEXT:    s_clause 0x1
1348; GFX1164-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1349; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1350; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
1351; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1352; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
1353; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1354; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
1355; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
1356; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
1357; GFX1164-NEXT:    s_cbranch_execz .LBB4_2
1358; GFX1164-NEXT:  ; %bb.1:
1359; GFX1164-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
1360; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
1361; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1362; GFX1164-NEXT:    s_mul_i32 s9, s1, s8
1363; GFX1164-NEXT:    s_mul_hi_u32 s10, s0, s8
1364; GFX1164-NEXT:    s_mul_i32 s8, s0, s8
1365; GFX1164-NEXT:    s_add_i32 s10, s10, s9
1366; GFX1164-NEXT:    v_mov_b32_e32 v0, s8
1367; GFX1164-NEXT:    v_mov_b32_e32 v1, s10
1368; GFX1164-NEXT:    s_mov_b32 s10, -1
1369; GFX1164-NEXT:    s_mov_b32 s8, s6
1370; GFX1164-NEXT:    s_mov_b32 s9, s7
1371; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1372; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1373; GFX1164-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
1374; GFX1164-NEXT:    s_waitcnt vmcnt(0)
1375; GFX1164-NEXT:    buffer_gl0_inv
1376; GFX1164-NEXT:    buffer_gl1_inv
1377; GFX1164-NEXT:  .LBB4_2:
1378; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
1379; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
1380; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
1381; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1382; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
1383; GFX1164-NEXT:    s_mov_b32 s6, -1
1384; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1385; GFX1164-NEXT:    v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
1386; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
1387; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1388; GFX1164-NEXT:    v_mov_b32_e32 v1, v3
1389; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
1390; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1391; GFX1164-NEXT:    s_endpgm
1392;
1393; GFX1132-LABEL: add_i64_uniform:
1394; GFX1132:       ; %bb.0: ; %entry
1395; GFX1132-NEXT:    s_clause 0x1
1396; GFX1132-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1397; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
1398; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
1399; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
1400; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
1401; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
1402; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1403; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
1404; GFX1132-NEXT:    s_cbranch_execz .LBB4_2
1405; GFX1132-NEXT:  ; %bb.1:
1406; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
1407; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
1408; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1409; GFX1132-NEXT:    s_mul_i32 s8, s1, s3
1410; GFX1132-NEXT:    s_mul_hi_u32 s9, s0, s3
1411; GFX1132-NEXT:    s_mul_i32 s3, s0, s3
1412; GFX1132-NEXT:    s_add_i32 s9, s9, s8
1413; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1414; GFX1132-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
1415; GFX1132-NEXT:    s_mov_b32 s10, -1
1416; GFX1132-NEXT:    s_mov_b32 s8, s6
1417; GFX1132-NEXT:    s_mov_b32 s9, s7
1418; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1419; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1420; GFX1132-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
1421; GFX1132-NEXT:    s_waitcnt vmcnt(0)
1422; GFX1132-NEXT:    buffer_gl0_inv
1423; GFX1132-NEXT:    buffer_gl1_inv
1424; GFX1132-NEXT:  .LBB4_2:
1425; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
1426; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
1427; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
1428; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1429; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
1430; GFX1132-NEXT:    s_mov_b32 s6, -1
1431; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1432; GFX1132-NEXT:    v_mad_u64_u32 v[0:1], null, s0, v2, s[2:3]
1433; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s1, v2, v[1:2]
1434; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1435; GFX1132-NEXT:    v_mov_b32_e32 v1, v3
1436; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
1437; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1438; GFX1132-NEXT:    s_endpgm
1439entry:
1440  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel
1441  store i64 %old, i64 addrspace(1)* %out
1442  ret void
1443}
1444
; 64-bit add whose operand differs per lane: the x workitem id, zero-extended
; to i64, is added atomically to *%inout (acq_rel) and the value returned by
; the atomic is stored to *%out.
;
; In every expected-output group below the atomic is issued directly on
; v[0:1] (v1 is set to 0 for the high half) and no v_mbcnt / v_readfirstlane
; sequence surrounds it, unlike the constant and uniform tests in this file.
; These checks therefore record the operation being left un-combined here.
; NOTE(review): v0 is read without being written first, so it is presumably
; the incoming workitem id register - confirm against the calling convention.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script rather than editing by hand.
1445define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
1446; GFX7LESS-LABEL: add_i64_varying:
1447; GFX7LESS:       ; %bb.0: ; %entry
1448; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1449; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1450; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1451; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
1452; GFX7LESS-NEXT:    s_mov_b32 s10, s6
1453; GFX7LESS-NEXT:    s_mov_b32 s11, s7
1454; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1455; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1456; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1457; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1458; GFX7LESS-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1459; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1460; GFX7LESS-NEXT:    buffer_wbinvl1
1461; GFX7LESS-NEXT:    s_mov_b32 s4, s0
1462; GFX7LESS-NEXT:    s_mov_b32 s5, s1
1463; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1464; GFX7LESS-NEXT:    s_endpgm
1465;
1466; GFX89-LABEL: add_i64_varying:
1467; GFX89:       ; %bb.0: ; %entry
1468; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1469; GFX89-NEXT:    s_mov_b32 s7, 0xf000
1470; GFX89-NEXT:    s_mov_b32 s6, -1
1471; GFX89-NEXT:    s_mov_b32 s10, s6
1472; GFX89-NEXT:    s_mov_b32 s11, s7
1473; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
1474; GFX89-NEXT:    s_mov_b32 s8, s2
1475; GFX89-NEXT:    s_mov_b32 s9, s3
1476; GFX89-NEXT:    v_mov_b32_e32 v1, 0
1477; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1478; GFX89-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1479; GFX89-NEXT:    s_waitcnt vmcnt(0)
1480; GFX89-NEXT:    buffer_wbinvl1_vol
1481; GFX89-NEXT:    s_mov_b32 s4, s0
1482; GFX89-NEXT:    s_mov_b32 s5, s1
1483; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1484; GFX89-NEXT:    s_endpgm
1485;
1486; GFX10-LABEL: add_i64_varying:
1487; GFX10:       ; %bb.0: ; %entry
1488; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1489; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1490; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
1491; GFX10-NEXT:    s_mov_b32 s6, -1
1492; GFX10-NEXT:    s_mov_b32 s11, s7
1493; GFX10-NEXT:    s_mov_b32 s10, s6
1494; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1495; GFX10-NEXT:    s_mov_b32 s8, s2
1496; GFX10-NEXT:    s_mov_b32 s9, s3
1497; GFX10-NEXT:    s_mov_b32 s4, s0
1498; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1499; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1500; GFX10-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc
1501; GFX10-NEXT:    s_waitcnt vmcnt(0)
1502; GFX10-NEXT:    buffer_gl0_inv
1503; GFX10-NEXT:    buffer_gl1_inv
1504; GFX10-NEXT:    s_mov_b32 s5, s1
1505; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1506; GFX10-NEXT:    s_endpgm
1507;
1508; GFX11-LABEL: add_i64_varying:
1509; GFX11:       ; %bb.0: ; %entry
1510; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1511; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1512; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
1513; GFX11-NEXT:    s_mov_b32 s6, -1
1514; GFX11-NEXT:    s_mov_b32 s11, s7
1515; GFX11-NEXT:    s_mov_b32 s10, s6
1516; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1517; GFX11-NEXT:    s_mov_b32 s8, s2
1518; GFX11-NEXT:    s_mov_b32 s9, s3
1519; GFX11-NEXT:    s_mov_b32 s4, s0
1520; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1521; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1522; GFX11-NEXT:    buffer_atomic_add_u64 v[0:1], off, s[8:11], 0 glc
1523; GFX11-NEXT:    s_waitcnt vmcnt(0)
1524; GFX11-NEXT:    buffer_gl0_inv
1525; GFX11-NEXT:    buffer_gl1_inv
1526; GFX11-NEXT:    s_mov_b32 s5, s1
1527; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
1528; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1529; GFX11-NEXT:    s_endpgm
1530entry:
  ; Per-lane operand: the x component of the workitem id.
1531  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Widen to i64 so the operand matches the 64-bit atomic.
1532  %zext = zext i32 %lane to i64
  ; atomicrmw yields the value that was in memory before the add.
1533  %old = atomicrmw add i64 addrspace(1)* %inout, i64 %zext acq_rel
  ; Storing the result keeps the returned value live in the generated code.
1534  store i64 %old, i64 addrspace(1)* %out
1535  ret void
1536}
1537
; 32-bit subtract of the constant 5: atomicrmw sub on *%inout (acq_rel), with
; the returned old value stored to *%out.
;
; Shape of the expected code, common to all groups below:
;   * v_mbcnt_* over a copy of exec gives each lane the count of active lanes
;     below it; only the lane whose count is 0 enters %bb.1.
;   * %bb.1 computes s_bcnt1(exec copy) * 5 with s_mul_i32 and issues a single
;     buffer_atomic_sub carrying that total.
;   * After the join label (.LBB6_2) each lane reads the atomic's result with
;     v_readfirstlane and subtracts 5 * its own count (v_mul_u32_u24 followed
;     by a v_sub variant) before the store.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script rather than editing by hand.
1538define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
1539; GFX7LESS-LABEL: sub_i32_constant:
1540; GFX7LESS:       ; %bb.0: ; %entry
1541; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
1542; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
1543; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
1544; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
1545; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1546; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1547; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1548; GFX7LESS-NEXT:    s_cbranch_execz .LBB6_2
1549; GFX7LESS-NEXT:  ; %bb.1:
1550; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
1551; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1552; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
1553; GFX7LESS-NEXT:    s_mov_b32 s10, -1
1554; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1555; GFX7LESS-NEXT:    s_mov_b32 s8, s2
1556; GFX7LESS-NEXT:    s_mov_b32 s9, s3
1557; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s6
1558; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1559; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1560; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1561; GFX7LESS-NEXT:    buffer_wbinvl1
1562; GFX7LESS-NEXT:  .LBB6_2:
1563; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
1564; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1565; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
1566; GFX7LESS-NEXT:    s_mov_b32 s2, -1
1567; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v1
1568; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1569; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
1570; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1571; GFX7LESS-NEXT:    s_endpgm
1572;
1573; GFX8-LABEL: sub_i32_constant:
1574; GFX8:       ; %bb.0: ; %entry
1575; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1576; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1577; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1578; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1579; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1580; GFX8-NEXT:    ; implicit-def: $vgpr1
1581; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1582; GFX8-NEXT:    s_cbranch_execz .LBB6_2
1583; GFX8-NEXT:  ; %bb.1:
1584; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1585; GFX8-NEXT:    s_mov_b32 s8, s2
1586; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1587; GFX8-NEXT:    s_mul_i32 s2, s2, 5
1588; GFX8-NEXT:    s_mov_b32 s11, 0xf000
1589; GFX8-NEXT:    s_mov_b32 s10, -1
1590; GFX8-NEXT:    s_mov_b32 s9, s3
1591; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1592; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1593; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1594; GFX8-NEXT:    s_waitcnt vmcnt(0)
1595; GFX8-NEXT:    buffer_wbinvl1_vol
1596; GFX8-NEXT:  .LBB6_2:
1597; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1598; GFX8-NEXT:    v_readfirstlane_b32 s4, v1
1599; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1600; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1601; GFX8-NEXT:    s_mov_b32 s3, 0xf000
1602; GFX8-NEXT:    s_mov_b32 s2, -1
1603; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
1604; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1605; GFX8-NEXT:    s_endpgm
1606;
1607; GFX9-LABEL: sub_i32_constant:
1608; GFX9:       ; %bb.0: ; %entry
1609; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1610; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1611; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1612; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1613; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1614; GFX9-NEXT:    ; implicit-def: $vgpr1
1615; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1616; GFX9-NEXT:    s_cbranch_execz .LBB6_2
1617; GFX9-NEXT:  ; %bb.1:
1618; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1619; GFX9-NEXT:    s_mov_b32 s8, s2
1620; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
1621; GFX9-NEXT:    s_mul_i32 s2, s2, 5
1622; GFX9-NEXT:    s_mov_b32 s11, 0xf000
1623; GFX9-NEXT:    s_mov_b32 s10, -1
1624; GFX9-NEXT:    s_mov_b32 s9, s3
1625; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1626; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1627; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1628; GFX9-NEXT:    s_waitcnt vmcnt(0)
1629; GFX9-NEXT:    buffer_wbinvl1_vol
1630; GFX9-NEXT:  .LBB6_2:
1631; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1632; GFX9-NEXT:    v_readfirstlane_b32 s4, v1
1633; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1634; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1635; GFX9-NEXT:    s_mov_b32 s3, 0xf000
1636; GFX9-NEXT:    s_mov_b32 s2, -1
1637; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
1638; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1639; GFX9-NEXT:    s_endpgm
1640;
1641; GFX1064-LABEL: sub_i32_constant:
1642; GFX1064:       ; %bb.0: ; %entry
1643; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1644; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
1645; GFX1064-NEXT:    ; implicit-def: $vgpr1
1646; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1647; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1648; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1649; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1650; GFX1064-NEXT:    s_cbranch_execz .LBB6_2
1651; GFX1064-NEXT:  ; %bb.1:
1652; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1653; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
1654; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
1655; GFX1064-NEXT:    s_mov_b32 s10, -1
1656; GFX1064-NEXT:    v_mov_b32_e32 v1, s6
1657; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1658; GFX1064-NEXT:    s_mov_b32 s8, s2
1659; GFX1064-NEXT:    s_mov_b32 s9, s3
1660; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1661; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1662; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1663; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1664; GFX1064-NEXT:    buffer_gl0_inv
1665; GFX1064-NEXT:    buffer_gl1_inv
1666; GFX1064-NEXT:  .LBB6_2:
1667; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1668; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
1669; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1670; GFX1064-NEXT:    v_readfirstlane_b32 s2, v1
1671; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1672; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
1673; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1674; GFX1064-NEXT:    s_mov_b32 s2, -1
1675; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1676; GFX1064-NEXT:    s_endpgm
1677;
1678; GFX1032-LABEL: sub_i32_constant:
1679; GFX1032:       ; %bb.0: ; %entry
1680; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
1681; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
1682; GFX1032-NEXT:    ; implicit-def: $vgpr1
1683; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1684; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1685; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1686; GFX1032-NEXT:    s_cbranch_execz .LBB6_2
1687; GFX1032-NEXT:  ; %bb.1:
1688; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
1689; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
1690; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
1691; GFX1032-NEXT:    s_mov_b32 s10, -1
1692; GFX1032-NEXT:    v_mov_b32_e32 v1, s5
1693; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1694; GFX1032-NEXT:    s_mov_b32 s8, s2
1695; GFX1032-NEXT:    s_mov_b32 s9, s3
1696; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1697; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1698; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1699; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1700; GFX1032-NEXT:    buffer_gl0_inv
1701; GFX1032-NEXT:    buffer_gl1_inv
1702; GFX1032-NEXT:  .LBB6_2:
1703; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1704; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1705; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1706; GFX1032-NEXT:    v_readfirstlane_b32 s2, v1
1707; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1708; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
1709; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1710; GFX1032-NEXT:    s_mov_b32 s2, -1
1711; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1712; GFX1032-NEXT:    s_endpgm
1713;
1714; GFX1164-LABEL: sub_i32_constant:
1715; GFX1164:       ; %bb.0: ; %entry
1716; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1717; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
1718; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
1719; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1720; GFX1164-NEXT:    ; implicit-def: $vgpr1
1721; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1722; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1723; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
1724; GFX1164-NEXT:    s_cbranch_execz .LBB6_2
1725; GFX1164-NEXT:  ; %bb.1:
1726; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
1727; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
1728; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
1729; GFX1164-NEXT:    s_mov_b32 s10, -1
1730; GFX1164-NEXT:    v_mov_b32_e32 v1, s6
1731; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1732; GFX1164-NEXT:    s_mov_b32 s8, s2
1733; GFX1164-NEXT:    s_mov_b32 s9, s3
1734; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1735; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
1736; GFX1164-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1737; GFX1164-NEXT:    s_waitcnt vmcnt(0)
1738; GFX1164-NEXT:    buffer_gl0_inv
1739; GFX1164-NEXT:    buffer_gl1_inv
1740; GFX1164-NEXT:  .LBB6_2:
1741; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
1742; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1743; GFX1164-NEXT:    v_readfirstlane_b32 s2, v1
1744; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1745; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
1746; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1747; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1748; GFX1164-NEXT:    s_mov_b32 s2, -1
1749; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1750; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1751; GFX1164-NEXT:    s_endpgm
1752;
1753; GFX1132-LABEL: sub_i32_constant:
1754; GFX1132:       ; %bb.0: ; %entry
1755; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
1756; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
1757; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
1758; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1759; GFX1132-NEXT:    ; implicit-def: $vgpr1
1760; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1761; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
1762; GFX1132-NEXT:    s_cbranch_execz .LBB6_2
1763; GFX1132-NEXT:  ; %bb.1:
1764; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
1765; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
1766; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
1767; GFX1132-NEXT:    s_mov_b32 s10, -1
1768; GFX1132-NEXT:    v_mov_b32_e32 v1, s5
1769; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1770; GFX1132-NEXT:    s_mov_b32 s8, s2
1771; GFX1132-NEXT:    s_mov_b32 s9, s3
1772; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1773; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
1774; GFX1132-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1775; GFX1132-NEXT:    s_waitcnt vmcnt(0)
1776; GFX1132-NEXT:    buffer_gl0_inv
1777; GFX1132-NEXT:    buffer_gl1_inv
1778; GFX1132-NEXT:  .LBB6_2:
1779; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1780; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
1781; GFX1132-NEXT:    v_readfirstlane_b32 s2, v1
1782; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1783; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
1784; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1785; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
1786; GFX1132-NEXT:    s_mov_b32 s2, -1
1787; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
1788; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1789; GFX1132-NEXT:    s_endpgm
1790entry:
  ; The operand is the literal 5, identical in every lane; atomicrmw yields the
  ; value that was in memory before the subtract.
1791  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel
  ; Storing the result keeps the returned value live in the generated code.
1792  store i32 %old, i32 addrspace(1)* %out
1793  ret void
1794}
1795
; 32-bit subtract of the kernel argument %subitive: atomicrmw sub on *%inout
; (acq_rel), with the returned old value stored to *%out.
;
; Shape of the expected code, common to all groups below:
;   * %subitive is fetched with a scalar load (s_load_dword / s_load_b32)
;     next to the two pointer arguments.
;   * v_mbcnt_* over a copy of exec gives each lane the count of active lanes
;     below it; only the lane whose count is 0 enters %bb.1.
;   * %bb.1 multiplies the loaded argument by s_bcnt1(exec copy) with
;     s_mul_i32 and issues a single buffer_atomic_sub carrying that product.
;   * After the join label (.LBB7_2) each lane reads the atomic's result with
;     v_readfirstlane and subtracts argument * its own count (v_mul_lo_u32
;     followed by a v_sub variant) before the store.
;
; The check lines are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the update script rather than editing by hand.
1796define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) {
1797; GFX7LESS-LABEL: sub_i32_uniform:
1798; GFX7LESS:       ; %bb.0: ; %entry
1799; GFX7LESS-NEXT:    s_mov_b64 s[2:3], exec
1800; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
1801; GFX7LESS-NEXT:    s_load_dword s8, s[0:1], 0xd
1802; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1803; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1804; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1805; GFX7LESS-NEXT:    ; implicit-def: $vgpr1
1806; GFX7LESS-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1807; GFX7LESS-NEXT:    s_cbranch_execz .LBB7_2
1808; GFX7LESS-NEXT:  ; %bb.1:
1809; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
1810; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1811; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1812; GFX7LESS-NEXT:    s_mul_i32 s2, s8, s2
1813; GFX7LESS-NEXT:    s_mov_b32 s14, -1
1814; GFX7LESS-NEXT:    s_mov_b32 s12, s6
1815; GFX7LESS-NEXT:    s_mov_b32 s13, s7
1816; GFX7LESS-NEXT:    v_mov_b32_e32 v1, s2
1817; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1818; GFX7LESS-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1819; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
1820; GFX7LESS-NEXT:    buffer_wbinvl1
1821; GFX7LESS-NEXT:  .LBB7_2:
1822; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[0:1]
1823; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
1824; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
1825; GFX7LESS-NEXT:    s_mov_b32 s6, -1
1826; GFX7LESS-NEXT:    v_readfirstlane_b32 s0, v1
1827; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s8, v0
1828; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1829; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1830; GFX7LESS-NEXT:    s_endpgm
1831;
1832; GFX8-LABEL: sub_i32_uniform:
1833; GFX8:       ; %bb.0: ; %entry
1834; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1835; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x34
1836; GFX8-NEXT:    s_mov_b64 s[2:3], exec
1837; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1838; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1839; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1840; GFX8-NEXT:    ; implicit-def: $vgpr1
1841; GFX8-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1842; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1843; GFX8-NEXT:  ; %bb.1:
1844; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1845; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1846; GFX8-NEXT:    s_mul_i32 s2, s8, s2
1847; GFX8-NEXT:    s_mov_b32 s15, 0xf000
1848; GFX8-NEXT:    s_mov_b32 s14, -1
1849; GFX8-NEXT:    s_mov_b32 s12, s6
1850; GFX8-NEXT:    s_mov_b32 s13, s7
1851; GFX8-NEXT:    v_mov_b32_e32 v1, s2
1852; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1853; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1854; GFX8-NEXT:    s_waitcnt vmcnt(0)
1855; GFX8-NEXT:    buffer_wbinvl1_vol
1856; GFX8-NEXT:  .LBB7_2:
1857; GFX8-NEXT:    s_or_b64 exec, exec, s[0:1]
1858; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1859; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
1860; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1861; GFX8-NEXT:    s_mov_b32 s7, 0xf000
1862; GFX8-NEXT:    s_mov_b32 s6, -1
1863; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1864; GFX8-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1865; GFX8-NEXT:    s_endpgm
1866;
1867; GFX9-LABEL: sub_i32_uniform:
1868; GFX9:       ; %bb.0: ; %entry
1869; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1870; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x34
1871; GFX9-NEXT:    s_mov_b64 s[2:3], exec
1872; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1873; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1874; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1875; GFX9-NEXT:    ; implicit-def: $vgpr1
1876; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1877; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1878; GFX9-NEXT:  ; %bb.1:
1879; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1880; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1881; GFX9-NEXT:    s_mul_i32 s2, s8, s2
1882; GFX9-NEXT:    s_mov_b32 s15, 0xf000
1883; GFX9-NEXT:    s_mov_b32 s14, -1
1884; GFX9-NEXT:    s_mov_b32 s12, s6
1885; GFX9-NEXT:    s_mov_b32 s13, s7
1886; GFX9-NEXT:    v_mov_b32_e32 v1, s2
1887; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1888; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1889; GFX9-NEXT:    s_waitcnt vmcnt(0)
1890; GFX9-NEXT:    buffer_wbinvl1_vol
1891; GFX9-NEXT:  .LBB7_2:
1892; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
1893; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1894; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
1895; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1896; GFX9-NEXT:    s_mov_b32 s7, 0xf000
1897; GFX9-NEXT:    s_mov_b32 s6, -1
1898; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1899; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1900; GFX9-NEXT:    s_endpgm
1901;
1902; GFX1064-LABEL: sub_i32_uniform:
1903; GFX1064:       ; %bb.0: ; %entry
1904; GFX1064-NEXT:    s_clause 0x1
1905; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1906; GFX1064-NEXT:    s_load_dword s8, s[0:1], 0x34
1907; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
1908; GFX1064-NEXT:    ; implicit-def: $vgpr1
1909; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1910; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1911; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1912; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1913; GFX1064-NEXT:    s_cbranch_execz .LBB7_2
1914; GFX1064-NEXT:  ; %bb.1:
1915; GFX1064-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1916; GFX1064-NEXT:    s_mov_b32 s15, 0x31016000
1917; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1918; GFX1064-NEXT:    s_mul_i32 s2, s8, s2
1919; GFX1064-NEXT:    s_mov_b32 s14, -1
1920; GFX1064-NEXT:    v_mov_b32_e32 v1, s2
1921; GFX1064-NEXT:    s_mov_b32 s12, s6
1922; GFX1064-NEXT:    s_mov_b32 s13, s7
1923; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1924; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
1925; GFX1064-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1926; GFX1064-NEXT:    s_waitcnt vmcnt(0)
1927; GFX1064-NEXT:    buffer_gl0_inv
1928; GFX1064-NEXT:    buffer_gl1_inv
1929; GFX1064-NEXT:  .LBB7_2:
1930; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
1931; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
1932; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
1933; GFX1064-NEXT:    v_mul_lo_u32 v0, s8, v0
1934; GFX1064-NEXT:    v_readfirstlane_b32 s0, v1
1935; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
1936; GFX1064-NEXT:    s_mov_b32 s6, -1
1937; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1938; GFX1064-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1939; GFX1064-NEXT:    s_endpgm
1940;
1941; GFX1032-LABEL: sub_i32_uniform:
1942; GFX1032:       ; %bb.0: ; %entry
1943; GFX1032-NEXT:    s_clause 0x1
1944; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
1945; GFX1032-NEXT:    s_load_dword s2, s[0:1], 0x34
1946; GFX1032-NEXT:    s_mov_b32 s3, exec_lo
1947; GFX1032-NEXT:    ; implicit-def: $vgpr1
1948; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
1949; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1950; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
1951; GFX1032-NEXT:    s_cbranch_execz .LBB7_2
1952; GFX1032-NEXT:  ; %bb.1:
1953; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s3
1954; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
1955; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1956; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
1957; GFX1032-NEXT:    s_mov_b32 s10, -1
1958; GFX1032-NEXT:    v_mov_b32_e32 v1, s1
1959; GFX1032-NEXT:    s_mov_b32 s8, s6
1960; GFX1032-NEXT:    s_mov_b32 s9, s7
1961; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1962; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
1963; GFX1032-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1964; GFX1032-NEXT:    s_waitcnt vmcnt(0)
1965; GFX1032-NEXT:    buffer_gl0_inv
1966; GFX1032-NEXT:    buffer_gl1_inv
1967; GFX1032-NEXT:  .LBB7_2:
1968; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
1969; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
1970; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
1971; GFX1032-NEXT:    v_mul_lo_u32 v0, s2, v0
1972; GFX1032-NEXT:    v_readfirstlane_b32 s0, v1
1973; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
1974; GFX1032-NEXT:    s_mov_b32 s6, -1
1975; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1976; GFX1032-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1977; GFX1032-NEXT:    s_endpgm
1978;
1979; GFX1164-LABEL: sub_i32_uniform:
1980; GFX1164:       ; %bb.0: ; %entry
1981; GFX1164-NEXT:    s_clause 0x1
1982; GFX1164-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
1983; GFX1164-NEXT:    s_load_b32 s8, s[0:1], 0x34
1984; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
1985; GFX1164-NEXT:    s_mov_b64 s[0:1], exec
1986; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
1987; GFX1164-NEXT:    ; implicit-def: $vgpr1
1988; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1989; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
1990; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
1991; GFX1164-NEXT:    s_cbranch_execz .LBB7_2
1992; GFX1164-NEXT:  ; %bb.1:
1993; GFX1164-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
1994; GFX1164-NEXT:    s_mov_b32 s15, 0x31016000
1995; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
1996; GFX1164-NEXT:    s_mul_i32 s2, s8, s2
1997; GFX1164-NEXT:    s_mov_b32 s14, -1
1998; GFX1164-NEXT:    v_mov_b32_e32 v1, s2
1999; GFX1164-NEXT:    s_mov_b32 s12, s6
2000; GFX1164-NEXT:    s_mov_b32 s13, s7
2001; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2002; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2003; GFX1164-NEXT:    buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc
2004; GFX1164-NEXT:    s_waitcnt vmcnt(0)
2005; GFX1164-NEXT:    buffer_gl0_inv
2006; GFX1164-NEXT:    buffer_gl1_inv
2007; GFX1164-NEXT:  .LBB7_2:
2008; GFX1164-NEXT:    s_or_b64 exec, exec, s[0:1]
2009; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2010; GFX1164-NEXT:    v_mul_lo_u32 v0, s8, v0
2011; GFX1164-NEXT:    v_readfirstlane_b32 s0, v1
2012; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
2013; GFX1164-NEXT:    s_mov_b32 s6, -1
2014; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2015; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2016; GFX1164-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2017; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2018; GFX1164-NEXT:    s_endpgm
2019;
2020; GFX1132-LABEL: sub_i32_uniform:
2021; GFX1132:       ; %bb.0: ; %entry
2022; GFX1132-NEXT:    s_clause 0x1
2023; GFX1132-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
2024; GFX1132-NEXT:    s_load_b32 s0, s[0:1], 0x34
2025; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
2026; GFX1132-NEXT:    s_mov_b32 s1, exec_lo
2027; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
2028; GFX1132-NEXT:    ; implicit-def: $vgpr1
2029; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2030; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
2031; GFX1132-NEXT:    s_cbranch_execz .LBB7_2
2032; GFX1132-NEXT:  ; %bb.1:
2033; GFX1132-NEXT:    s_bcnt1_i32_b32 s2, s2
2034; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
2035; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2036; GFX1132-NEXT:    s_mul_i32 s2, s0, s2
2037; GFX1132-NEXT:    s_mov_b32 s10, -1
2038; GFX1132-NEXT:    v_mov_b32_e32 v1, s2
2039; GFX1132-NEXT:    s_mov_b32 s8, s6
2040; GFX1132-NEXT:    s_mov_b32 s9, s7
2041; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2042; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2043; GFX1132-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
2044; GFX1132-NEXT:    s_waitcnt vmcnt(0)
2045; GFX1132-NEXT:    buffer_gl0_inv
2046; GFX1132-NEXT:    buffer_gl1_inv
2047; GFX1132-NEXT:  .LBB7_2:
2048; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s1
2049; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2050; GFX1132-NEXT:    v_mul_lo_u32 v0, s0, v0
2051; GFX1132-NEXT:    v_readfirstlane_b32 s0, v1
2052; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
2053; GFX1132-NEXT:    s_mov_b32 s6, -1
2054; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2055; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
2056; GFX1132-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
2057; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2058; GFX1132-NEXT:    s_endpgm
2059entry:
  ; The operand is a kernel argument, so it is the same in every lane but not
  ; known at compile time; atomicrmw yields the pre-subtract memory value.
2060  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel
  ; Storing the result keeps the returned value live in the generated code.
2061  store i32 %old, i32 addrspace(1)* %out
2062  ret void
2063}
2064
; sub_i32_varying -- `atomicrmw sub` (acq_rel) on the global i32 at %inout whose
; operand is the value returned by @llvm.amdgcn.workitem.id.x; the old value
; returned by the atomic is stored to %out.
;
; What the autogenerated assertions below expect:
;  * On the GFX7LESS run the atomic is emitted directly as a buffer_atomic_sub
;    on v0, with no cross-lane reduction and no branch.
;  * On the GFX8 and GFX9 runs a chain of v_add_u32_dpp steps (row_shr 1/2/4/8,
;    then row_bcast 15/31) accumulates the operand across lanes with exec
;    forced to -1, the lane-63 value is read into s6, and only the lane whose
;    mbcnt result is 0 issues a single buffer_atomic_sub with that value.
;    After the branch rejoins, each lane computes readfirstlane(atomic result)
;    minus its wave_shr:1-shifted partial sum and stores it.
;  * On the GFX10 and GFX11 runs the same shape is expected, built from
;    v_add_nc_u32_dpp, v_permlanex16_b32 and v_readlane/v_writelane; the
;    wave32 variants use the 32-bit exec_lo forms.
;
; The assertion lines are generated output: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2065define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) {
2066; GFX7LESS-LABEL: sub_i32_varying:
2067; GFX7LESS:       ; %bb.0: ; %entry
2068; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2069; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2070; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2071; GFX7LESS-NEXT:    s_mov_b32 s10, s6
2072; GFX7LESS-NEXT:    s_mov_b32 s11, s7
2073; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2074; GFX7LESS-NEXT:    s_mov_b32 s8, s2
2075; GFX7LESS-NEXT:    s_mov_b32 s9, s3
2076; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2077; GFX7LESS-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
2078; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
2079; GFX7LESS-NEXT:    buffer_wbinvl1
2080; GFX7LESS-NEXT:    s_mov_b32 s4, s0
2081; GFX7LESS-NEXT:    s_mov_b32 s5, s1
2082; GFX7LESS-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2083; GFX7LESS-NEXT:    s_endpgm
2084;
2085; GFX8-LABEL: sub_i32_varying:
2086; GFX8:       ; %bb.0: ; %entry
2087; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2088; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2089; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2090; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2091; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2092; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2093; GFX8-NEXT:    v_mov_b32_e32 v2, v0
2094; GFX8-NEXT:    s_not_b64 exec, exec
2095; GFX8-NEXT:    v_mov_b32_e32 v2, 0
2096; GFX8-NEXT:    s_not_b64 exec, exec
2097; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
2098; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2099; GFX8-NEXT:    s_nop 1
2100; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2101; GFX8-NEXT:    s_nop 1
2102; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2103; GFX8-NEXT:    s_nop 1
2104; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2105; GFX8-NEXT:    s_nop 1
2106; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2107; GFX8-NEXT:    s_nop 1
2108; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2109; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
2110; GFX8-NEXT:    s_nop 0
2111; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2112; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
2113; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2114; GFX8-NEXT:    ; implicit-def: $vgpr0
2115; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2116; GFX8-NEXT:    s_cbranch_execz .LBB8_2
2117; GFX8-NEXT:  ; %bb.1:
2118; GFX8-NEXT:    s_mov_b32 s11, 0xf000
2119; GFX8-NEXT:    s_mov_b32 s10, -1
2120; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2121; GFX8-NEXT:    s_mov_b32 s8, s2
2122; GFX8-NEXT:    s_mov_b32 s9, s3
2123; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2124; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2125; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
2126; GFX8-NEXT:    s_waitcnt vmcnt(0)
2127; GFX8-NEXT:    buffer_wbinvl1_vol
2128; GFX8-NEXT:  .LBB8_2:
2129; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2130; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
2131; GFX8-NEXT:    v_mov_b32_e32 v0, v1
2132; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2133; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2134; GFX8-NEXT:    s_mov_b32 s2, -1
2135; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
2136; GFX8-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2137; GFX8-NEXT:    s_endpgm
2138;
2139; GFX9-LABEL: sub_i32_varying:
2140; GFX9:       ; %bb.0: ; %entry
2141; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2142; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2143; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2144; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2145; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
2146; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
2147; GFX9-NEXT:    v_mov_b32_e32 v2, v0
2148; GFX9-NEXT:    s_not_b64 exec, exec
2149; GFX9-NEXT:    v_mov_b32_e32 v2, 0
2150; GFX9-NEXT:    s_not_b64 exec, exec
2151; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
2152; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2153; GFX9-NEXT:    s_nop 1
2154; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2155; GFX9-NEXT:    s_nop 1
2156; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2157; GFX9-NEXT:    s_nop 1
2158; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2159; GFX9-NEXT:    s_nop 1
2160; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
2161; GFX9-NEXT:    s_nop 1
2162; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
2163; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
2164; GFX9-NEXT:    s_nop 0
2165; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
2166; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
2167; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
2168; GFX9-NEXT:    ; implicit-def: $vgpr0
2169; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2170; GFX9-NEXT:    s_cbranch_execz .LBB8_2
2171; GFX9-NEXT:  ; %bb.1:
2172; GFX9-NEXT:    s_mov_b32 s11, 0xf000
2173; GFX9-NEXT:    s_mov_b32 s10, -1
2174; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2175; GFX9-NEXT:    s_mov_b32 s8, s2
2176; GFX9-NEXT:    s_mov_b32 s9, s3
2177; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2178; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2179; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
2180; GFX9-NEXT:    s_waitcnt vmcnt(0)
2181; GFX9-NEXT:    buffer_wbinvl1_vol
2182; GFX9-NEXT:  .LBB8_2:
2183; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2184; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
2185; GFX9-NEXT:    v_mov_b32_e32 v0, v1
2186; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2187; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2188; GFX9-NEXT:    s_mov_b32 s2, -1
2189; GFX9-NEXT:    v_sub_u32_e32 v0, s4, v0
2190; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2191; GFX9-NEXT:    s_endpgm
2192;
2193; GFX1064-LABEL: sub_i32_varying:
2194; GFX1064:       ; %bb.0: ; %entry
2195; GFX1064-NEXT:    v_mov_b32_e32 v1, v0
2196; GFX1064-NEXT:    s_not_b64 exec, exec
2197; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2198; GFX1064-NEXT:    s_not_b64 exec, exec
2199; GFX1064-NEXT:    s_or_saveexec_b64 s[2:3], -1
2200; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2201; GFX1064-NEXT:    v_mov_b32_e32 v3, 0
2202; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2203; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2204; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2205; GFX1064-NEXT:    v_mov_b32_e32 v2, v1
2206; GFX1064-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2207; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2208; GFX1064-NEXT:    v_readlane_b32 s4, v1, 31
2209; GFX1064-NEXT:    v_mov_b32_e32 v2, s4
2210; GFX1064-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2211; GFX1064-NEXT:    v_readlane_b32 s6, v1, 15
2212; GFX1064-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2213; GFX1064-NEXT:    s_mov_b64 exec, s[2:3]
2214; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2215; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2216; GFX1064-NEXT:    v_readlane_b32 s7, v1, 31
2217; GFX1064-NEXT:    v_writelane_b32 v3, s6, 16
2218; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2219; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2220; GFX1064-NEXT:    s_or_saveexec_b64 s[4:5], -1
2221; GFX1064-NEXT:    v_readlane_b32 s8, v1, 47
2222; GFX1064-NEXT:    v_readlane_b32 s9, v1, 63
2223; GFX1064-NEXT:    v_writelane_b32 v3, s7, 32
2224; GFX1064-NEXT:    s_mov_b64 exec, s[4:5]
2225; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2226; GFX1064-NEXT:    s_or_saveexec_b64 s[6:7], -1
2227; GFX1064-NEXT:    s_mov_b32 s4, s9
2228; GFX1064-NEXT:    v_writelane_b32 v3, s8, 48
2229; GFX1064-NEXT:    s_mov_b64 exec, s[6:7]
2230; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2231; GFX1064-NEXT:    s_mov_b32 s6, -1
2232; GFX1064-NEXT:    ; implicit-def: $vgpr0
2233; GFX1064-NEXT:    s_and_saveexec_b64 s[8:9], vcc
2234; GFX1064-NEXT:    s_cbranch_execz .LBB8_2
2235; GFX1064-NEXT:  ; %bb.1:
2236; GFX1064-NEXT:    v_mov_b32_e32 v0, s4
2237; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
2238; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2239; GFX1064-NEXT:    s_mov_b32 s4, s2
2240; GFX1064-NEXT:    s_mov_b32 s5, s3
2241; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2242; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2243; GFX1064-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
2244; GFX1064-NEXT:    s_waitcnt vmcnt(0)
2245; GFX1064-NEXT:    buffer_gl0_inv
2246; GFX1064-NEXT:    buffer_gl1_inv
2247; GFX1064-NEXT:  .LBB8_2:
2248; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2249; GFX1064-NEXT:    s_or_b64 exec, exec, s[8:9]
2250; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2251; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2252; GFX1064-NEXT:    v_mov_b32_e32 v0, v3
2253; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2254; GFX1064-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2255; GFX1064-NEXT:    s_mov_b32 s2, s6
2256; GFX1064-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2257; GFX1064-NEXT:    s_endpgm
2258;
2259; GFX1032-LABEL: sub_i32_varying:
2260; GFX1032:       ; %bb.0: ; %entry
2261; GFX1032-NEXT:    v_mov_b32_e32 v1, v0
2262; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2263; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2264; GFX1032-NEXT:    s_not_b32 exec_lo, exec_lo
2265; GFX1032-NEXT:    s_or_saveexec_b32 s2, -1
2266; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2267; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2268; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2269; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2270; GFX1032-NEXT:    v_mov_b32_e32 v2, v1
2271; GFX1032-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2272; GFX1032-NEXT:    s_mov_b32 exec_lo, s2
2273; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2274; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2275; GFX1032-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2276; GFX1032-NEXT:    v_mov_b32_e32 v3, 0
2277; GFX1032-NEXT:    v_readlane_b32 s5, v1, 15
2278; GFX1032-NEXT:    v_readlane_b32 s6, v1, 31
2279; GFX1032-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2280; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2281; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2282; GFX1032-NEXT:    s_or_saveexec_b32 s4, -1
2283; GFX1032-NEXT:    v_writelane_b32 v3, s5, 16
2284; GFX1032-NEXT:    s_mov_b32 exec_lo, s4
2285; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2286; GFX1032-NEXT:    s_mov_b32 s4, s6
2287; GFX1032-NEXT:    s_mov_b32 s6, -1
2288; GFX1032-NEXT:    ; implicit-def: $vgpr0
2289; GFX1032-NEXT:    s_and_saveexec_b32 s8, vcc_lo
2290; GFX1032-NEXT:    s_cbranch_execz .LBB8_2
2291; GFX1032-NEXT:  ; %bb.1:
2292; GFX1032-NEXT:    v_mov_b32_e32 v0, s4
2293; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
2294; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2295; GFX1032-NEXT:    s_mov_b32 s4, s2
2296; GFX1032-NEXT:    s_mov_b32 s5, s3
2297; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2298; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2299; GFX1032-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
2300; GFX1032-NEXT:    s_waitcnt vmcnt(0)
2301; GFX1032-NEXT:    buffer_gl0_inv
2302; GFX1032-NEXT:    buffer_gl1_inv
2303; GFX1032-NEXT:  .LBB8_2:
2304; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2305; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s8
2306; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2307; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2308; GFX1032-NEXT:    v_mov_b32_e32 v0, v3
2309; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2310; GFX1032-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2311; GFX1032-NEXT:    s_mov_b32 s2, s6
2312; GFX1032-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2313; GFX1032-NEXT:    s_endpgm
2314;
2315; GFX1164-LABEL: sub_i32_varying:
2316; GFX1164:       ; %bb.0: ; %entry
2317; GFX1164-NEXT:    v_mov_b32_e32 v1, v0
2318; GFX1164-NEXT:    s_not_b64 exec, exec
2319; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2320; GFX1164-NEXT:    s_not_b64 exec, exec
2321; GFX1164-NEXT:    s_or_saveexec_b64 s[2:3], -1
2322; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2323; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2324; GFX1164-NEXT:    v_mov_b32_e32 v3, 0
2325; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2326; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2327; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2328; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2329; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2330; GFX1164-NEXT:    v_mov_b32_e32 v2, v1
2331; GFX1164-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2332; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2333; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2334; GFX1164-NEXT:    v_readlane_b32 s4, v1, 31
2335; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2336; GFX1164-NEXT:    v_mov_b32_e32 v2, s4
2337; GFX1164-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
2338; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2339; GFX1164-NEXT:    v_readlane_b32 s6, v1, 15
2340; GFX1164-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2341; GFX1164-NEXT:    s_mov_b64 exec, s[2:3]
2342; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2343; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
2344; GFX1164-NEXT:    v_readlane_b32 s7, v1, 31
2345; GFX1164-NEXT:    v_writelane_b32 v3, s6, 16
2346; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
2347; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2348; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2349; GFX1164-NEXT:    s_or_saveexec_b64 s[4:5], -1
2350; GFX1164-NEXT:    v_readlane_b32 s8, v1, 47
2351; GFX1164-NEXT:    v_readlane_b32 s9, v1, 63
2352; GFX1164-NEXT:    v_writelane_b32 v3, s7, 32
2353; GFX1164-NEXT:    s_mov_b64 exec, s[4:5]
2354; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2355; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2356; GFX1164-NEXT:    s_or_saveexec_b64 s[6:7], -1
2357; GFX1164-NEXT:    s_mov_b32 s4, s9
2358; GFX1164-NEXT:    v_writelane_b32 v3, s8, 48
2359; GFX1164-NEXT:    s_mov_b64 exec, s[6:7]
2360; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
2361; GFX1164-NEXT:    s_mov_b32 s6, -1
2362; GFX1164-NEXT:    ; implicit-def: $vgpr0
2363; GFX1164-NEXT:    s_and_saveexec_b64 s[8:9], vcc
2364; GFX1164-NEXT:    s_cbranch_execz .LBB8_2
2365; GFX1164-NEXT:  ; %bb.1:
2366; GFX1164-NEXT:    v_mov_b32_e32 v0, s4
2367; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
2368; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2369; GFX1164-NEXT:    s_mov_b32 s4, s2
2370; GFX1164-NEXT:    s_mov_b32 s5, s3
2371; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2372; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2373; GFX1164-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
2374; GFX1164-NEXT:    s_waitcnt vmcnt(0)
2375; GFX1164-NEXT:    buffer_gl0_inv
2376; GFX1164-NEXT:    buffer_gl1_inv
2377; GFX1164-NEXT:  .LBB8_2:
2378; GFX1164-NEXT:    s_or_b64 exec, exec, s[8:9]
2379; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2380; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
2381; GFX1164-NEXT:    v_mov_b32_e32 v0, v3
2382; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2383; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2384; GFX1164-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2385; GFX1164-NEXT:    s_mov_b32 s2, s6
2386; GFX1164-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2387; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2388; GFX1164-NEXT:    s_endpgm
2389;
2390; GFX1132-LABEL: sub_i32_varying:
2391; GFX1132:       ; %bb.0: ; %entry
2392; GFX1132-NEXT:    v_mov_b32_e32 v1, v0
2393; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2394; GFX1132-NEXT:    v_mov_b32_e32 v1, 0
2395; GFX1132-NEXT:    s_not_b32 exec_lo, exec_lo
2396; GFX1132-NEXT:    s_or_saveexec_b32 s2, -1
2397; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2398; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
2399; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
2400; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2401; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
2402; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
2403; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2404; GFX1132-NEXT:    v_mov_b32_e32 v2, v1
2405; GFX1132-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
2406; GFX1132-NEXT:    s_mov_b32 exec_lo, s2
2407; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2408; GFX1132-NEXT:    s_or_saveexec_b32 s4, -1
2409; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2410; GFX1132-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
2411; GFX1132-NEXT:    v_mov_b32_e32 v3, 0
2412; GFX1132-NEXT:    v_readlane_b32 s5, v1, 15
2413; GFX1132-NEXT:    v_readlane_b32 s6, v1, 31
2414; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2415; GFX1132-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
2416; GFX1132-NEXT:    s_mov_b32 exec_lo, s4
2417; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2418; GFX1132-NEXT:    s_or_saveexec_b32 s4, -1
2419; GFX1132-NEXT:    v_writelane_b32 v3, s5, 16
2420; GFX1132-NEXT:    s_mov_b32 exec_lo, s4
2421; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2)
2422; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
2423; GFX1132-NEXT:    s_mov_b32 s4, s6
2424; GFX1132-NEXT:    s_mov_b32 s6, -1
2425; GFX1132-NEXT:    ; implicit-def: $vgpr0
2426; GFX1132-NEXT:    s_and_saveexec_b32 s8, vcc_lo
2427; GFX1132-NEXT:    s_cbranch_execz .LBB8_2
2428; GFX1132-NEXT:  ; %bb.1:
2429; GFX1132-NEXT:    v_mov_b32_e32 v0, s4
2430; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
2431; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2432; GFX1132-NEXT:    s_mov_b32 s4, s2
2433; GFX1132-NEXT:    s_mov_b32 s5, s3
2434; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2435; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2436; GFX1132-NEXT:    buffer_atomic_sub_u32 v0, off, s[4:7], 0 glc
2437; GFX1132-NEXT:    s_waitcnt vmcnt(0)
2438; GFX1132-NEXT:    buffer_gl0_inv
2439; GFX1132-NEXT:    buffer_gl1_inv
2440; GFX1132-NEXT:  .LBB8_2:
2441; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s8
2442; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2443; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
2444; GFX1132-NEXT:    v_mov_b32_e32 v0, v3
2445; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2446; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2447; GFX1132-NEXT:    v_sub_nc_u32_e32 v0, s2, v0
2448; GFX1132-NEXT:    s_mov_b32 s2, s6
2449; GFX1132-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
2450; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2451; GFX1132-NEXT:    s_endpgm
2452entry:
  ; The atomic operand is the workitem id, so it is not a compile-time constant
  ; and is not taken from a kernel argument.
2453  %lane = call i32 @llvm.amdgcn.workitem.id.x()
2454  %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel
  ; Storing the pre-operation value keeps the atomic's result live.
2455  store i32 %old, i32 addrspace(1)* %out
2456  ret void
2457}
2458
; sub_i64_constant -- `atomicrmw sub` (acq_rel) of the constant 5 on the global
; i64 at %inout; the old value returned by the atomic is stored to %out.
;
; What the autogenerated assertions below expect on every run line:
;  * exec is copied to SGPRs and v_mbcnt_* computes a per-lane index from it;
;    only the lane whose index is 0 enters the atomic block.
;  * Inside that block the popcount of the saved exec mask (s_bcnt1_i32_*) is
;    multiplied by 5 and used, with a zero high dword, as the operand of one
;    64-bit buffer atomic sub (buffer_atomic_sub_x2, or buffer_atomic_sub_u64
;    on the GFX11 runs).
;  * After the branch rejoins, the atomic result is broadcast with two
;    v_readfirstlane_b32 and each lane subtracts 5 * its mbcnt index
;    (v_mul_u32_u24 / v_mul_hi_u32_u24) using a sub + sub-with-borrow pair
;    before the 64-bit store.
;
; The assertion lines are generated output: regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
2459define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
2460; GFX7LESS-LABEL: sub_i64_constant:
2461; GFX7LESS:       ; %bb.0: ; %entry
2462; GFX7LESS-NEXT:    s_mov_b64 s[6:7], exec
2463; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
2464; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
2465; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s7, v0
2466; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2467; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2468; GFX7LESS-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2469; GFX7LESS-NEXT:    s_cbranch_execz .LBB9_2
2470; GFX7LESS-NEXT:  ; %bb.1:
2471; GFX7LESS-NEXT:    s_mov_b32 s11, 0xf000
2472; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2473; GFX7LESS-NEXT:    s_mul_i32 s6, s6, 5
2474; GFX7LESS-NEXT:    s_mov_b32 s10, -1
2475; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2476; GFX7LESS-NEXT:    s_mov_b32 s8, s2
2477; GFX7LESS-NEXT:    s_mov_b32 s9, s3
2478; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2479; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
2480; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2481; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2482; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
2483; GFX7LESS-NEXT:    buffer_wbinvl1
2484; GFX7LESS-NEXT:  .LBB9_2:
2485; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[4:5]
2486; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2487; GFX7LESS-NEXT:    s_mov_b32 s3, 0xf000
2488; GFX7LESS-NEXT:    s_mov_b32 s2, -1
2489; GFX7LESS-NEXT:    v_readfirstlane_b32 s4, v0
2490; GFX7LESS-NEXT:    v_readfirstlane_b32 s5, v1
2491; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
2492; GFX7LESS-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2493; GFX7LESS-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2494; GFX7LESS-NEXT:    v_mov_b32_e32 v2, s5
2495; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
2496; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2497; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2498; GFX7LESS-NEXT:    s_endpgm
2499;
2500; GFX8-LABEL: sub_i64_constant:
2501; GFX8:       ; %bb.0: ; %entry
2502; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2503; GFX8-NEXT:    s_mov_b64 s[6:7], exec
2504; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2505; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2506; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2507; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2508; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2509; GFX8-NEXT:    s_cbranch_execz .LBB9_2
2510; GFX8-NEXT:  ; %bb.1:
2511; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2512; GFX8-NEXT:    s_mov_b32 s8, s2
2513; GFX8-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
2514; GFX8-NEXT:    s_mul_i32 s2, s2, 5
2515; GFX8-NEXT:    s_mov_b32 s11, 0xf000
2516; GFX8-NEXT:    s_mov_b32 s10, -1
2517; GFX8-NEXT:    s_mov_b32 s9, s3
2518; GFX8-NEXT:    v_mov_b32_e32 v0, s2
2519; GFX8-NEXT:    v_mov_b32_e32 v1, 0
2520; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2521; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2522; GFX8-NEXT:    s_waitcnt vmcnt(0)
2523; GFX8-NEXT:    buffer_wbinvl1_vol
2524; GFX8-NEXT:  .LBB9_2:
2525; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
2526; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
2527; GFX8-NEXT:    v_readfirstlane_b32 s5, v1
2528; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2529; GFX8-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2530; GFX8-NEXT:    v_mov_b32_e32 v2, s5
2531; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
2532; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2533; GFX8-NEXT:    s_mov_b32 s3, 0xf000
2534; GFX8-NEXT:    s_mov_b32 s2, -1
2535; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
2536; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2537; GFX8-NEXT:    s_endpgm
2538;
2539; GFX9-LABEL: sub_i64_constant:
2540; GFX9:       ; %bb.0: ; %entry
2541; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2542; GFX9-NEXT:    s_mov_b64 s[6:7], exec
2543; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2544; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2545; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2546; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2547; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2548; GFX9-NEXT:    s_cbranch_execz .LBB9_2
2549; GFX9-NEXT:  ; %bb.1:
2550; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2551; GFX9-NEXT:    s_mov_b32 s8, s2
2552; GFX9-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
2553; GFX9-NEXT:    s_mul_i32 s2, s2, 5
2554; GFX9-NEXT:    s_mov_b32 s11, 0xf000
2555; GFX9-NEXT:    s_mov_b32 s10, -1
2556; GFX9-NEXT:    s_mov_b32 s9, s3
2557; GFX9-NEXT:    v_mov_b32_e32 v0, s2
2558; GFX9-NEXT:    v_mov_b32_e32 v1, 0
2559; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2560; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2561; GFX9-NEXT:    s_waitcnt vmcnt(0)
2562; GFX9-NEXT:    buffer_wbinvl1_vol
2563; GFX9-NEXT:  .LBB9_2:
2564; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
2565; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
2566; GFX9-NEXT:    v_readfirstlane_b32 s5, v1
2567; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2568; GFX9-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2569; GFX9-NEXT:    v_mov_b32_e32 v2, s5
2570; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
2571; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2572; GFX9-NEXT:    s_mov_b32 s3, 0xf000
2573; GFX9-NEXT:    s_mov_b32 s2, -1
2574; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2575; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2576; GFX9-NEXT:    s_endpgm
2577;
2578; GFX1064-LABEL: sub_i64_constant:
2579; GFX1064:       ; %bb.0: ; %entry
2580; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2581; GFX1064-NEXT:    s_mov_b64 s[6:7], exec
2582; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2583; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2584; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2585; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2586; GFX1064-NEXT:    s_and_saveexec_b64 s[4:5], vcc
2587; GFX1064-NEXT:    s_cbranch_execz .LBB9_2
2588; GFX1064-NEXT:  ; %bb.1:
2589; GFX1064-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2590; GFX1064-NEXT:    v_mov_b32_e32 v1, 0
2591; GFX1064-NEXT:    s_mul_i32 s6, s6, 5
2592; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
2593; GFX1064-NEXT:    v_mov_b32_e32 v0, s6
2594; GFX1064-NEXT:    s_mov_b32 s10, -1
2595; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2596; GFX1064-NEXT:    s_mov_b32 s8, s2
2597; GFX1064-NEXT:    s_mov_b32 s9, s3
2598; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2599; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2600; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2601; GFX1064-NEXT:    s_waitcnt vmcnt(0)
2602; GFX1064-NEXT:    buffer_gl0_inv
2603; GFX1064-NEXT:    buffer_gl1_inv
2604; GFX1064-NEXT:  .LBB9_2:
2605; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2606; GFX1064-NEXT:    s_or_b64 exec, exec, s[4:5]
2607; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2608; GFX1064-NEXT:    v_readfirstlane_b32 s2, v0
2609; GFX1064-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2610; GFX1064-NEXT:    v_readfirstlane_b32 s3, v1
2611; GFX1064-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2612; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2613; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2614; GFX1064-NEXT:    s_mov_b32 s3, 0x31016000
2615; GFX1064-NEXT:    s_mov_b32 s2, -1
2616; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2617; GFX1064-NEXT:    s_endpgm
2618;
2619; GFX1032-LABEL: sub_i64_constant:
2620; GFX1032:       ; %bb.0: ; %entry
2621; GFX1032-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
2622; GFX1032-NEXT:    s_mov_b32 s5, exec_lo
2623; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2624; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
2625; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2626; GFX1032-NEXT:    s_and_saveexec_b32 s4, vcc_lo
2627; GFX1032-NEXT:    s_cbranch_execz .LBB9_2
2628; GFX1032-NEXT:  ; %bb.1:
2629; GFX1032-NEXT:    s_bcnt1_i32_b32 s5, s5
2630; GFX1032-NEXT:    v_mov_b32_e32 v1, 0
2631; GFX1032-NEXT:    s_mul_i32 s5, s5, 5
2632; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
2633; GFX1032-NEXT:    v_mov_b32_e32 v0, s5
2634; GFX1032-NEXT:    s_mov_b32 s10, -1
2635; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2636; GFX1032-NEXT:    s_mov_b32 s8, s2
2637; GFX1032-NEXT:    s_mov_b32 s9, s3
2638; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2639; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2640; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2641; GFX1032-NEXT:    s_waitcnt vmcnt(0)
2642; GFX1032-NEXT:    buffer_gl0_inv
2643; GFX1032-NEXT:    buffer_gl1_inv
2644; GFX1032-NEXT:  .LBB9_2:
2645; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2646; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2647; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2648; GFX1032-NEXT:    v_readfirstlane_b32 s2, v0
2649; GFX1032-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2650; GFX1032-NEXT:    v_readfirstlane_b32 s3, v1
2651; GFX1032-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2652; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2653; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2654; GFX1032-NEXT:    s_mov_b32 s3, 0x31016000
2655; GFX1032-NEXT:    s_mov_b32 s2, -1
2656; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2657; GFX1032-NEXT:    s_endpgm
2658;
2659; GFX1164-LABEL: sub_i64_constant:
2660; GFX1164:       ; %bb.0: ; %entry
2661; GFX1164-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2662; GFX1164-NEXT:    s_mov_b64 s[6:7], exec
2663; GFX1164-NEXT:    s_mov_b64 s[4:5], exec
2664; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
2665; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2666; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
2667; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
2668; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
2669; GFX1164-NEXT:    s_cbranch_execz .LBB9_2
2670; GFX1164-NEXT:  ; %bb.1:
2671; GFX1164-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
2672; GFX1164-NEXT:    v_mov_b32_e32 v1, 0
2673; GFX1164-NEXT:    s_mul_i32 s6, s6, 5
2674; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
2675; GFX1164-NEXT:    v_mov_b32_e32 v0, s6
2676; GFX1164-NEXT:    s_mov_b32 s10, -1
2677; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2678; GFX1164-NEXT:    s_mov_b32 s8, s2
2679; GFX1164-NEXT:    s_mov_b32 s9, s3
2680; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2681; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
2682; GFX1164-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
2683; GFX1164-NEXT:    s_waitcnt vmcnt(0)
2684; GFX1164-NEXT:    buffer_gl0_inv
2685; GFX1164-NEXT:    buffer_gl1_inv
2686; GFX1164-NEXT:  .LBB9_2:
2687; GFX1164-NEXT:    s_or_b64 exec, exec, s[4:5]
2688; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2689; GFX1164-NEXT:    v_readfirstlane_b32 s2, v0
2690; GFX1164-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2691; GFX1164-NEXT:    v_readfirstlane_b32 s3, v1
2692; GFX1164-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2693; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2694; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s2, v0
2695; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc
2696; GFX1164-NEXT:    s_mov_b32 s3, 0x31016000
2697; GFX1164-NEXT:    s_mov_b32 s2, -1
2698; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2699; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2700; GFX1164-NEXT:    s_endpgm
2701;
2702; GFX1132-LABEL: sub_i64_constant:
2703; GFX1132:       ; %bb.0: ; %entry
2704; GFX1132-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
2705; GFX1132-NEXT:    s_mov_b32 s5, exec_lo
2706; GFX1132-NEXT:    s_mov_b32 s4, exec_lo
2707; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s5, 0
2708; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
2709; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2710; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
2711; GFX1132-NEXT:    s_cbranch_execz .LBB9_2
2712; GFX1132-NEXT:  ; %bb.1:
2713; GFX1132-NEXT:    s_bcnt1_i32_b32 s5, s5
2714; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
2715; GFX1132-NEXT:    s_mul_i32 s5, s5, 5
2716; GFX1132-NEXT:    s_mov_b32 s10, -1
2717; GFX1132-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
2718; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2719; GFX1132-NEXT:    s_mov_b32 s8, s2
2720; GFX1132-NEXT:    s_mov_b32 s9, s3
2721; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2722; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
2723; GFX1132-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
2724; GFX1132-NEXT:    s_waitcnt vmcnt(0)
2725; GFX1132-NEXT:    buffer_gl0_inv
2726; GFX1132-NEXT:    buffer_gl1_inv
2727; GFX1132-NEXT:  .LBB9_2:
2728; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s4
2729; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
2730; GFX1132-NEXT:    v_readfirstlane_b32 s2, v0
2731; GFX1132-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
2732; GFX1132-NEXT:    v_readfirstlane_b32 s3, v1
2733; GFX1132-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
2734; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2735; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s2, v0
2736; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo
2737; GFX1132-NEXT:    s_mov_b32 s3, 0x31016000
2738; GFX1132-NEXT:    s_mov_b32 s2, -1
2739; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
2740; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2741; GFX1132-NEXT:    s_endpgm
2742entry:
  ; The operand is the immediate 5; it appears as the multiplier in the
  ; s_mul_i32 and v_mul_*_u24 instructions of the expected output above.
2743  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel
2744  store i64 %old, i64 addrspace(1)* %out
2745  ret void
2746}
2747
; Test case sub_i64_uniform. A 64-bit `atomicrmw sub` with acq_rel ordering on a
; global (addrspace(1)) pointer, where the subtrahend %subitive is a kernel
; argument (the checks below load it into SGPRs with s_load).
;
; In the expected output of every RUN configuration the atomic is issued under
; a branch that is only taken by the lane whose mbcnt result is 0, and its
; operand is %subitive multiplied by the s_bcnt1 population count of the saved
; exec mask. After the branch rejoins, the returned value is read with
; v_readfirstlane and each lane subtracts (%subitive * its mbcnt index) from it
; before the result is stored to %out.
2748define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) {
2749; GFX7LESS-LABEL: sub_i64_uniform:
2750; GFX7LESS:       ; %bb.0: ; %entry
2751; GFX7LESS-NEXT:    s_mov_b64 s[8:9], exec
2752; GFX7LESS-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
2753; GFX7LESS-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
2754; GFX7LESS-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s8, 0
2755; GFX7LESS-NEXT:    v_mbcnt_hi_u32_b32_e32 v2, s9, v0
2756; GFX7LESS-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2757; GFX7LESS-NEXT:    ; implicit-def: $vgpr0_vgpr1
2758; GFX7LESS-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2759; GFX7LESS-NEXT:    s_cbranch_execz .LBB10_2
2760; GFX7LESS-NEXT:  ; %bb.1:
2761; GFX7LESS-NEXT:    s_mov_b32 s15, 0xf000
2762; GFX7LESS-NEXT:    s_mov_b32 s14, -1
2763; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2764; GFX7LESS-NEXT:    s_mov_b32 s12, s6
2765; GFX7LESS-NEXT:    s_mov_b32 s13, s7
2766; GFX7LESS-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
2767; GFX7LESS-NEXT:    s_mul_i32 s7, s1, s6
2768; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2769; GFX7LESS-NEXT:    v_mul_hi_u32 v0, s0, v0
2770; GFX7LESS-NEXT:    s_mul_i32 s6, s0, s6
2771; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, s7, v0
2772; GFX7LESS-NEXT:    v_mov_b32_e32 v0, s6
2773; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2774; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
2775; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
2776; GFX7LESS-NEXT:    buffer_wbinvl1
2777; GFX7LESS-NEXT:  .LBB10_2:
2778; GFX7LESS-NEXT:    s_or_b64 exec, exec, s[2:3]
2779; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
2780; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
2781; GFX7LESS-NEXT:    s_mov_b32 s6, -1
2782; GFX7LESS-NEXT:    v_readfirstlane_b32 s2, v0
2783; GFX7LESS-NEXT:    v_readfirstlane_b32 s3, v1
2784; GFX7LESS-NEXT:    s_waitcnt expcnt(0)
2785; GFX7LESS-NEXT:    v_mul_lo_u32 v0, s1, v2
2786; GFX7LESS-NEXT:    v_mul_hi_u32 v1, s0, v2
2787; GFX7LESS-NEXT:    v_mul_lo_u32 v2, s0, v2
2788; GFX7LESS-NEXT:    v_add_i32_e32 v1, vcc, v1, v0
2789; GFX7LESS-NEXT:    v_mov_b32_e32 v3, s3
2790; GFX7LESS-NEXT:    v_sub_i32_e32 v0, vcc, s2, v2
2791; GFX7LESS-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2792; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2793; GFX7LESS-NEXT:    s_endpgm
2794;
2795; GFX8-LABEL: sub_i64_uniform:
2796; GFX8:       ; %bb.0: ; %entry
2797; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2798; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x34
2799; GFX8-NEXT:    s_mov_b64 s[8:9], exec
2800; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
2801; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
2802; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2803; GFX8-NEXT:    ; implicit-def: $vgpr0_vgpr1
2804; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
2805; GFX8-NEXT:    s_cbranch_execz .LBB10_2
2806; GFX8-NEXT:  ; %bb.1:
2807; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2808; GFX8-NEXT:    s_mov_b32 s12, s6
2809; GFX8-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
2810; GFX8-NEXT:    v_mov_b32_e32 v0, s6
2811; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
2812; GFX8-NEXT:    s_mul_i32 s6, s1, s6
2813; GFX8-NEXT:    s_mov_b32 s15, 0xf000
2814; GFX8-NEXT:    s_mov_b32 s14, -1
2815; GFX8-NEXT:    s_mov_b32 s13, s7
2816; GFX8-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
2817; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2818; GFX8-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
2819; GFX8-NEXT:    s_waitcnt vmcnt(0)
2820; GFX8-NEXT:    buffer_wbinvl1_vol
2821; GFX8-NEXT:  .LBB10_2:
2822; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
2823; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2824; GFX8-NEXT:    v_mul_lo_u32 v4, s1, v2
2825; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
2826; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
2827; GFX8-NEXT:    v_readfirstlane_b32 s1, v1
2828; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v3, v4
2829; GFX8-NEXT:    v_mov_b32_e32 v3, s1
2830; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v2
2831; GFX8-NEXT:    s_mov_b32 s7, 0xf000
2832; GFX8-NEXT:    s_mov_b32 s6, -1
2833; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
2834; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2835; GFX8-NEXT:    s_endpgm
2836;
2837; GFX9-LABEL: sub_i64_uniform:
2838; GFX9:       ; %bb.0: ; %entry
2839; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2840; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2841; GFX9-NEXT:    s_mov_b64 s[8:9], exec
2842; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
2843; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
2844; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2845; GFX9-NEXT:    ; implicit-def: $vgpr0_vgpr1
2846; GFX9-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2847; GFX9-NEXT:    s_cbranch_execz .LBB10_2
2848; GFX9-NEXT:  ; %bb.1:
2849; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2850; GFX9-NEXT:    s_mov_b32 s12, s6
2851; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[8:9]
2852; GFX9-NEXT:    s_mov_b32 s13, s7
2853; GFX9-NEXT:    s_mul_i32 s7, s3, s6
2854; GFX9-NEXT:    s_mul_hi_u32 s8, s2, s6
2855; GFX9-NEXT:    s_add_i32 s8, s8, s7
2856; GFX9-NEXT:    s_mul_i32 s6, s2, s6
2857; GFX9-NEXT:    s_mov_b32 s15, 0xf000
2858; GFX9-NEXT:    s_mov_b32 s14, -1
2859; GFX9-NEXT:    v_mov_b32_e32 v0, s6
2860; GFX9-NEXT:    v_mov_b32_e32 v1, s8
2861; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2862; GFX9-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
2863; GFX9-NEXT:    s_waitcnt vmcnt(0)
2864; GFX9-NEXT:    buffer_wbinvl1_vol
2865; GFX9-NEXT:  .LBB10_2:
2866; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
2867; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2868; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
2869; GFX9-NEXT:    s_mov_b32 s7, 0xf000
2870; GFX9-NEXT:    s_mov_b32 s6, -1
2871; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
2872; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
2873; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
2874; GFX9-NEXT:    v_mov_b32_e32 v1, v4
2875; GFX9-NEXT:    v_mov_b32_e32 v2, s1
2876; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s0, v3
2877; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
2878; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2879; GFX9-NEXT:    s_endpgm
2880;
2881; GFX1064-LABEL: sub_i64_uniform:
2882; GFX1064:       ; %bb.0: ; %entry
2883; GFX1064-NEXT:    s_clause 0x1
2884; GFX1064-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2885; GFX1064-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2886; GFX1064-NEXT:    s_mov_b64 s[8:9], exec
2887; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
2888; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
2889; GFX1064-NEXT:    ; implicit-def: $vgpr0_vgpr1
2890; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
2891; GFX1064-NEXT:    s_and_saveexec_b64 s[0:1], vcc
2892; GFX1064-NEXT:    s_cbranch_execz .LBB10_2
2893; GFX1064-NEXT:  ; %bb.1:
2894; GFX1064-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
2895; GFX1064-NEXT:    s_mov_b32 s11, 0x31016000
2896; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2897; GFX1064-NEXT:    s_mul_i32 s9, s3, s8
2898; GFX1064-NEXT:    s_mul_hi_u32 s10, s2, s8
2899; GFX1064-NEXT:    s_mul_i32 s8, s2, s8
2900; GFX1064-NEXT:    s_add_i32 s10, s10, s9
2901; GFX1064-NEXT:    v_mov_b32_e32 v0, s8
2902; GFX1064-NEXT:    v_mov_b32_e32 v1, s10
2903; GFX1064-NEXT:    s_mov_b32 s10, -1
2904; GFX1064-NEXT:    s_mov_b32 s8, s6
2905; GFX1064-NEXT:    s_mov_b32 s9, s7
2906; GFX1064-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2907; GFX1064-NEXT:    s_waitcnt_vscnt null, 0x0
2908; GFX1064-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2909; GFX1064-NEXT:    s_waitcnt vmcnt(0)
2910; GFX1064-NEXT:    buffer_gl0_inv
2911; GFX1064-NEXT:    buffer_gl1_inv
2912; GFX1064-NEXT:  .LBB10_2:
2913; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
2914; GFX1064-NEXT:    s_or_b64 exec, exec, s[0:1]
2915; GFX1064-NEXT:    s_waitcnt lgkmcnt(0)
2916; GFX1064-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v2, 0
2917; GFX1064-NEXT:    s_mov_b32 s7, 0x31016000
2918; GFX1064-NEXT:    s_mov_b32 s6, -1
2919; GFX1064-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5]
2920; GFX1064-NEXT:    v_readfirstlane_b32 s0, v0
2921; GFX1064-NEXT:    v_readfirstlane_b32 s1, v1
2922; GFX1064-NEXT:    v_sub_co_u32 v0, vcc, s0, v3
2923; GFX1064-NEXT:    v_mov_b32_e32 v1, v4
2924; GFX1064-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
2925; GFX1064-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2926; GFX1064-NEXT:    s_endpgm
2927;
2928; GFX1032-LABEL: sub_i64_uniform:
2929; GFX1032:       ; %bb.0: ; %entry
2930; GFX1032-NEXT:    s_clause 0x1
2931; GFX1032-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
2932; GFX1032-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
2933; GFX1032-NEXT:    s_mov_b32 s8, exec_lo
2934; GFX1032-NEXT:    ; implicit-def: $vgpr0_vgpr1
2935; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v2, s8, 0
2936; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
2937; GFX1032-NEXT:    s_and_saveexec_b32 s0, vcc_lo
2938; GFX1032-NEXT:    s_cbranch_execz .LBB10_2
2939; GFX1032-NEXT:  ; %bb.1:
2940; GFX1032-NEXT:    s_bcnt1_i32_b32 s1, s8
2941; GFX1032-NEXT:    s_mov_b32 s11, 0x31016000
2942; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2943; GFX1032-NEXT:    s_mul_i32 s8, s3, s1
2944; GFX1032-NEXT:    s_mul_hi_u32 s9, s2, s1
2945; GFX1032-NEXT:    s_mul_i32 s1, s2, s1
2946; GFX1032-NEXT:    s_add_i32 s9, s9, s8
2947; GFX1032-NEXT:    v_mov_b32_e32 v0, s1
2948; GFX1032-NEXT:    v_mov_b32_e32 v1, s9
2949; GFX1032-NEXT:    s_mov_b32 s10, -1
2950; GFX1032-NEXT:    s_mov_b32 s8, s6
2951; GFX1032-NEXT:    s_mov_b32 s9, s7
2952; GFX1032-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2953; GFX1032-NEXT:    s_waitcnt_vscnt null, 0x0
2954; GFX1032-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
2955; GFX1032-NEXT:    s_waitcnt vmcnt(0)
2956; GFX1032-NEXT:    buffer_gl0_inv
2957; GFX1032-NEXT:    buffer_gl1_inv
2958; GFX1032-NEXT:  .LBB10_2:
2959; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
2960; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s0
2961; GFX1032-NEXT:    s_waitcnt lgkmcnt(0)
2962; GFX1032-NEXT:    v_mad_u64_u32 v[3:4], s0, s2, v2, 0
2963; GFX1032-NEXT:    v_readfirstlane_b32 s1, v1
2964; GFX1032-NEXT:    s_mov_b32 s7, 0x31016000
2965; GFX1032-NEXT:    s_mov_b32 s6, -1
2966; GFX1032-NEXT:    v_mad_u64_u32 v[4:5], s0, s3, v2, v[4:5]
2967; GFX1032-NEXT:    v_readfirstlane_b32 s0, v0
2968; GFX1032-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v3
2969; GFX1032-NEXT:    v_mov_b32_e32 v1, v4
2970; GFX1032-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2971; GFX1032-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2972; GFX1032-NEXT:    s_endpgm
2973;
2974; GFX1164-LABEL: sub_i64_uniform:
2975; GFX1164:       ; %bb.0: ; %entry
2976; GFX1164-NEXT:    s_clause 0x1
2977; GFX1164-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
2978; GFX1164-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
2979; GFX1164-NEXT:    s_mov_b64 s[8:9], exec
2980; GFX1164-NEXT:    s_mov_b64 s[2:3], exec
2981; GFX1164-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
2982; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2983; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
2984; GFX1164-NEXT:    ; implicit-def: $vgpr0_vgpr1
2985; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v2
2986; GFX1164-NEXT:    s_cbranch_execz .LBB10_2
2987; GFX1164-NEXT:  ; %bb.1:
2988; GFX1164-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
2989; GFX1164-NEXT:    s_mov_b32 s11, 0x31016000
2990; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
2991; GFX1164-NEXT:    s_mul_i32 s9, s1, s8
2992; GFX1164-NEXT:    s_mul_hi_u32 s10, s0, s8
2993; GFX1164-NEXT:    s_mul_i32 s8, s0, s8
2994; GFX1164-NEXT:    s_add_i32 s10, s10, s9
2995; GFX1164-NEXT:    v_mov_b32_e32 v0, s8
2996; GFX1164-NEXT:    v_mov_b32_e32 v1, s10
2997; GFX1164-NEXT:    s_mov_b32 s10, -1
2998; GFX1164-NEXT:    s_mov_b32 s8, s6
2999; GFX1164-NEXT:    s_mov_b32 s9, s7
3000; GFX1164-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3001; GFX1164-NEXT:    s_waitcnt_vscnt null, 0x0
3002; GFX1164-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
3003; GFX1164-NEXT:    s_waitcnt vmcnt(0)
3004; GFX1164-NEXT:    buffer_gl0_inv
3005; GFX1164-NEXT:    buffer_gl1_inv
3006; GFX1164-NEXT:  .LBB10_2:
3007; GFX1164-NEXT:    s_or_b64 exec, exec, s[2:3]
3008; GFX1164-NEXT:    s_waitcnt lgkmcnt(0)
3009; GFX1164-NEXT:    v_mad_u64_u32 v[3:4], null, s0, v2, 0
3010; GFX1164-NEXT:    v_readfirstlane_b32 s0, v0
3011; GFX1164-NEXT:    s_mov_b32 s7, 0x31016000
3012; GFX1164-NEXT:    s_mov_b32 s6, -1
3013; GFX1164-NEXT:    s_waitcnt_depctr 0xfff
3014; GFX1164-NEXT:    v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
3015; GFX1164-NEXT:    v_readfirstlane_b32 s1, v1
3016; GFX1164-NEXT:    v_sub_co_u32 v0, vcc, s0, v3
3017; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3018; GFX1164-NEXT:    v_mov_b32_e32 v1, v5
3019; GFX1164-NEXT:    v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
3020; GFX1164-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
3021; GFX1164-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3022; GFX1164-NEXT:    s_endpgm
3023;
3024; GFX1132-LABEL: sub_i64_uniform:
3025; GFX1132:       ; %bb.0: ; %entry
3026; GFX1132-NEXT:    s_clause 0x1
3027; GFX1132-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
3028; GFX1132-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
3029; GFX1132-NEXT:    s_mov_b32 s3, exec_lo
3030; GFX1132-NEXT:    s_mov_b32 s2, exec_lo
3031; GFX1132-NEXT:    v_mbcnt_lo_u32_b32 v2, s3, 0
3032; GFX1132-NEXT:    ; implicit-def: $vgpr0_vgpr1
3033; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3034; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v2
3035; GFX1132-NEXT:    s_cbranch_execz .LBB10_2
3036; GFX1132-NEXT:  ; %bb.1:
3037; GFX1132-NEXT:    s_bcnt1_i32_b32 s3, s3
3038; GFX1132-NEXT:    s_mov_b32 s11, 0x31016000
3039; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3040; GFX1132-NEXT:    s_mul_i32 s8, s1, s3
3041; GFX1132-NEXT:    s_mul_hi_u32 s9, s0, s3
3042; GFX1132-NEXT:    s_mul_i32 s3, s0, s3
3043; GFX1132-NEXT:    s_add_i32 s9, s9, s8
3044; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
3045; GFX1132-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9
3046; GFX1132-NEXT:    s_mov_b32 s10, -1
3047; GFX1132-NEXT:    s_mov_b32 s8, s6
3048; GFX1132-NEXT:    s_mov_b32 s9, s7
3049; GFX1132-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3050; GFX1132-NEXT:    s_waitcnt_vscnt null, 0x0
3051; GFX1132-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
3052; GFX1132-NEXT:    s_waitcnt vmcnt(0)
3053; GFX1132-NEXT:    buffer_gl0_inv
3054; GFX1132-NEXT:    buffer_gl1_inv
3055; GFX1132-NEXT:  .LBB10_2:
3056; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s2
3057; GFX1132-NEXT:    s_waitcnt lgkmcnt(0)
3058; GFX1132-NEXT:    v_mad_u64_u32 v[3:4], null, s0, v2, 0
3059; GFX1132-NEXT:    v_readfirstlane_b32 s0, v0
3060; GFX1132-NEXT:    s_mov_b32 s7, 0x31016000
3061; GFX1132-NEXT:    s_mov_b32 s6, -1
3062; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
3063; GFX1132-NEXT:    v_mad_u64_u32 v[5:6], null, s1, v2, v[4:5]
3064; GFX1132-NEXT:    v_readfirstlane_b32 s1, v1
3065; GFX1132-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v3
3066; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3067; GFX1132-NEXT:    v_mov_b32_e32 v1, v5
3068; GFX1132-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
3069; GFX1132-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
3070; GFX1132-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3071; GFX1132-NEXT:    s_endpgm
3072entry:
  ; Atomically subtract the kernel argument from the i64 at %inout; %old is the
  ; value that was in memory before the subtraction.
3073  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel
  ; Write the pre-subtraction value out so the atomic's result is used.
3074  store i64 %old, i64 addrspace(1)* %out
3075  ret void
3076}
3077
; Test case sub_i64_varying. A 64-bit `atomicrmw sub` with acq_rel ordering on a
; global (addrspace(1)) pointer, where the subtrahend is the zero-extended
; result of llvm.amdgcn.workitem.id.x, i.e. a per-workitem value rather than a
; kernel argument.
;
; The expected output below contains no mbcnt / s_cbranch_execz /
; v_readfirstlane sequence: every configuration issues the buffer atomic
; directly on v[0:1], with the high dword v1 set to 0, and stores the returned
; pair to %out. The checks are shared between GFX8 and GFX9 (prefix GFX89) and
; between the wave32 and wave64 runs of gfx1010 and gfx1100 (prefixes GFX10 and
; GFX11).
3078define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) {
3079; GFX7LESS-LABEL: sub_i64_varying:
3080; GFX7LESS:       ; %bb.0: ; %entry
3081; GFX7LESS-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
3082; GFX7LESS-NEXT:    s_mov_b32 s7, 0xf000
3083; GFX7LESS-NEXT:    s_mov_b32 s6, -1
3084; GFX7LESS-NEXT:    v_mov_b32_e32 v1, 0
3085; GFX7LESS-NEXT:    s_mov_b32 s10, s6
3086; GFX7LESS-NEXT:    s_mov_b32 s11, s7
3087; GFX7LESS-NEXT:    s_waitcnt lgkmcnt(0)
3088; GFX7LESS-NEXT:    s_mov_b32 s8, s2
3089; GFX7LESS-NEXT:    s_mov_b32 s9, s3
3090; GFX7LESS-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3091; GFX7LESS-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
3092; GFX7LESS-NEXT:    s_waitcnt vmcnt(0)
3093; GFX7LESS-NEXT:    buffer_wbinvl1
3094; GFX7LESS-NEXT:    s_mov_b32 s4, s0
3095; GFX7LESS-NEXT:    s_mov_b32 s5, s1
3096; GFX7LESS-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3097; GFX7LESS-NEXT:    s_endpgm
3098;
3099; GFX89-LABEL: sub_i64_varying:
3100; GFX89:       ; %bb.0: ; %entry
3101; GFX89-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3102; GFX89-NEXT:    s_mov_b32 s7, 0xf000
3103; GFX89-NEXT:    s_mov_b32 s6, -1
3104; GFX89-NEXT:    s_mov_b32 s10, s6
3105; GFX89-NEXT:    s_mov_b32 s11, s7
3106; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
3107; GFX89-NEXT:    s_mov_b32 s8, s2
3108; GFX89-NEXT:    s_mov_b32 s9, s3
3109; GFX89-NEXT:    v_mov_b32_e32 v1, 0
3110; GFX89-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3111; GFX89-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
3112; GFX89-NEXT:    s_waitcnt vmcnt(0)
3113; GFX89-NEXT:    buffer_wbinvl1_vol
3114; GFX89-NEXT:    s_mov_b32 s4, s0
3115; GFX89-NEXT:    s_mov_b32 s5, s1
3116; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3117; GFX89-NEXT:    s_endpgm
3118;
3119; GFX10-LABEL: sub_i64_varying:
3120; GFX10:       ; %bb.0: ; %entry
3121; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
3122; GFX10-NEXT:    v_mov_b32_e32 v1, 0
3123; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
3124; GFX10-NEXT:    s_mov_b32 s6, -1
3125; GFX10-NEXT:    s_mov_b32 s11, s7
3126; GFX10-NEXT:    s_mov_b32 s10, s6
3127; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3128; GFX10-NEXT:    s_mov_b32 s8, s2
3129; GFX10-NEXT:    s_mov_b32 s9, s3
3130; GFX10-NEXT:    s_mov_b32 s4, s0
3131; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3132; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3133; GFX10-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc
3134; GFX10-NEXT:    s_waitcnt vmcnt(0)
3135; GFX10-NEXT:    buffer_gl0_inv
3136; GFX10-NEXT:    buffer_gl1_inv
3137; GFX10-NEXT:    s_mov_b32 s5, s1
3138; GFX10-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3139; GFX10-NEXT:    s_endpgm
3140;
3141; GFX11-LABEL: sub_i64_varying:
3142; GFX11:       ; %bb.0: ; %entry
3143; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
3144; GFX11-NEXT:    v_mov_b32_e32 v1, 0
3145; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
3146; GFX11-NEXT:    s_mov_b32 s6, -1
3147; GFX11-NEXT:    s_mov_b32 s11, s7
3148; GFX11-NEXT:    s_mov_b32 s10, s6
3149; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3150; GFX11-NEXT:    s_mov_b32 s8, s2
3151; GFX11-NEXT:    s_mov_b32 s9, s3
3152; GFX11-NEXT:    s_mov_b32 s4, s0
3153; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
3154; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3155; GFX11-NEXT:    buffer_atomic_sub_u64 v[0:1], off, s[8:11], 0 glc
3156; GFX11-NEXT:    s_waitcnt vmcnt(0)
3157; GFX11-NEXT:    buffer_gl0_inv
3158; GFX11-NEXT:    buffer_gl1_inv
3159; GFX11-NEXT:    s_mov_b32 s5, s1
3160; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
3161; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
3162; GFX11-NEXT:    s_endpgm
3163entry:
  ; Build the i64 operand from the workitem id in the x dimension.
3164  %lane = call i32 @llvm.amdgcn.workitem.id.x()
3165  %zext = zext i32 %lane to i64
  ; Atomically subtract that operand from the i64 at %inout; %old is the value
  ; that was in memory before the subtraction.
3166  %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %zext acq_rel
  ; Write the pre-subtraction value out so the atomic's result is used.
3167  store i64 %old, i64 addrspace(1)* %out
3168  ret void
3169}
3170