1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
9
10declare i32 @llvm.amdgcn.workitem.id.x()
11declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
12declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)
13
14; Show what the atomic optimization pass will do for struct buffers.
15
; add_i32_constant: kernel that calls the struct buffer atomic add intrinsic
; with the constant addend 5 on the %inout descriptor (the four trailing i32
; operands are all 0) and stores the value returned by the atomic to %out.
;
; Expected code shape, as pinned by the assertions below: the lane whose
; v_mbcnt result is 0 issues a single buffer_atomic_add of
; popcount(saved exec) * 5; afterwards every lane rebuilds its own result as
; readfirstlane(atomic result) + mbcnt * 5 (v_mad_u32_u24).
16define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
17; GFX6-LABEL: add_i32_constant:
18; GFX6:       ; %bb.0: ; %entry
19; GFX6-NEXT:    s_mov_b64 s[6:7], exec
20; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
21; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
22; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
23; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
24; GFX6-NEXT:    ; implicit-def: $vgpr1
25; GFX6-NEXT:    s_and_saveexec_b64 s[2:3], vcc
26; GFX6-NEXT:    s_cbranch_execz .LBB0_2
27; GFX6-NEXT:  ; %bb.1:
28; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
29; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
30; GFX6-NEXT:    s_mul_i32 s0, s0, 5
31; GFX6-NEXT:    v_mov_b32_e32 v1, s0
32; GFX6-NEXT:    v_mov_b32_e32 v2, 0
33; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
34; GFX6-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
35; GFX6-NEXT:  .LBB0_2:
36; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
37; GFX6-NEXT:    s_waitcnt vmcnt(0)
38; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
39; GFX6-NEXT:    s_mov_b32 s7, 0xf000
40; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
41; GFX6-NEXT:    s_mov_b32 s6, -1
42; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
44; GFX6-NEXT:    s_endpgm
45;
46; GFX8-LABEL: add_i32_constant:
47; GFX8:       ; %bb.0: ; %entry
48; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
49; GFX8-NEXT:    s_mov_b64 s[6:7], exec
50; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
51; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
52; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
53; GFX8-NEXT:    ; implicit-def: $vgpr1
54; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
55; GFX8-NEXT:    s_cbranch_execz .LBB0_2
56; GFX8-NEXT:  ; %bb.1:
57; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
58; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
59; GFX8-NEXT:    s_mul_i32 s0, s0, 5
60; GFX8-NEXT:    v_mov_b32_e32 v1, s0
61; GFX8-NEXT:    v_mov_b32_e32 v2, 0
62; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
63; GFX8-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
64; GFX8-NEXT:  .LBB0_2:
65; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
66; GFX8-NEXT:    s_waitcnt vmcnt(0)
67; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
68; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
69; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
70; GFX8-NEXT:    v_mov_b32_e32 v0, s2
71; GFX8-NEXT:    v_mov_b32_e32 v1, s3
72; GFX8-NEXT:    flat_store_dword v[0:1], v2
73; GFX8-NEXT:    s_endpgm
74;
75; GFX9-LABEL: add_i32_constant:
76; GFX9:       ; %bb.0: ; %entry
77; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
78; GFX9-NEXT:    s_mov_b64 s[6:7], exec
79; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
80; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
81; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
82; GFX9-NEXT:    ; implicit-def: $vgpr1
83; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
84; GFX9-NEXT:    s_cbranch_execz .LBB0_2
85; GFX9-NEXT:  ; %bb.1:
86; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
87; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
88; GFX9-NEXT:    s_mul_i32 s0, s0, 5
89; GFX9-NEXT:    v_mov_b32_e32 v1, s0
90; GFX9-NEXT:    v_mov_b32_e32 v2, 0
91; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
92; GFX9-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
93; GFX9-NEXT:  .LBB0_2:
94; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
95; GFX9-NEXT:    s_waitcnt vmcnt(0)
96; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
97; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
98; GFX9-NEXT:    v_mov_b32_e32 v1, 0
99; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
100; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
101; GFX9-NEXT:    s_endpgm
102;
103; GFX10W64-LABEL: add_i32_constant:
104; GFX10W64:       ; %bb.0: ; %entry
105; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
106; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
107; GFX10W64-NEXT:    ; implicit-def: $vgpr1
108; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
109; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
110; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
111; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
112; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
113; GFX10W64-NEXT:  ; %bb.1:
114; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
115; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
116; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
117; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
118; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
119; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
120; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
121; GFX10W64-NEXT:  .LBB0_2:
122; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
123; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
124; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
125; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
126; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
127; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
128; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
129; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
130; GFX10W64-NEXT:    s_endpgm
131;
132; GFX10W32-LABEL: add_i32_constant:
133; GFX10W32:       ; %bb.0: ; %entry
134; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
135; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
136; GFX10W32-NEXT:    ; implicit-def: $vgpr1
137; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
138; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
139; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
140; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
141; GFX10W32-NEXT:  ; %bb.1:
142; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
143; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
144; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
145; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
146; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
147; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
148; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
149; GFX10W32-NEXT:  .LBB0_2:
150; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
151; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
152; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
153; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
154; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
155; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
156; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
158; GFX10W32-NEXT:    s_endpgm
159;
160; GFX11W64-LABEL: add_i32_constant:
161; GFX11W64:       ; %bb.0: ; %entry
162; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
163; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
164; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
165; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
166; GFX11W64-NEXT:    ; implicit-def: $vgpr1
167; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
168; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
169; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
170; GFX11W64-NEXT:    s_cbranch_execz .LBB0_2
171; GFX11W64-NEXT:  ; %bb.1:
172; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
173; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
174; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
175; GFX11W64-NEXT:    s_mul_i32 s0, s0, 5
176; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
177; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
178; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
180; GFX11W64-NEXT:  .LBB0_2:
181; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
182; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
183; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
184; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
185; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
186; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
187; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
189; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
190; GFX11W64-NEXT:    s_endpgm
191;
192; GFX11W32-LABEL: add_i32_constant:
193; GFX11W32:       ; %bb.0: ; %entry
194; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
195; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
196; GFX11W32-NEXT:    s_mov_b32 s4, exec_lo
197; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
198; GFX11W32-NEXT:    ; implicit-def: $vgpr1
199; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
200; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
201; GFX11W32-NEXT:    s_cbranch_execz .LBB0_2
202; GFX11W32-NEXT:  ; %bb.1:
203; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
204; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s5
205; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
206; GFX11W32-NEXT:    s_mul_i32 s0, s0, 5
207; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
208; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
209; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
211; GFX11W32-NEXT:  .LBB0_2:
212; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
213; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
214; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
215; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
216; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
217; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
218; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
219; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
220; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
221; GFX11W32-NEXT:    s_endpgm
; IR under test. The assertions above are autogenerated (see the NOTE at the
; top of the file); regenerate them with the script instead of hand-editing.
222entry:
; Atomic add of the literal 5; the pre-add value comes back in %old.
223  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
; Storing %old keeps the atomic's return value live, so the per-lane result
; reconstruction appears in the checked output.
224  store i32 %old, i32 addrspace(1)* %out
225  ret void
226}
227
; add_i32_uniform: kernel that calls the struct buffer atomic add intrinsic
; with the kernel argument %additive as the addend on the %inout descriptor
; (the four trailing i32 operands are all 0) and stores the returned value
; to %out.
;
; Expected code shape, as pinned by the assertions below: the lane whose
; v_mbcnt result is 0 issues a single buffer_atomic_add of
; (loaded scalar addend) * popcount(saved exec) (s_mul_i32); afterwards every
; lane rebuilds its own result as readfirstlane(atomic result) + addend * mbcnt.
; The gfx6/gfx8/gfx9 output does this with v_mul_lo_u32 plus an add, while the
; gfx10/gfx11 output uses a single v_mad_u64_u32.
228define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
229; GFX6-LABEL: add_i32_uniform:
230; GFX6:       ; %bb.0: ; %entry
231; GFX6-NEXT:    s_mov_b64 s[2:3], exec
232; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
233; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
234; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
235; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
236; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
237; GFX6-NEXT:    ; implicit-def: $vgpr1
238; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
239; GFX6-NEXT:    s_cbranch_execz .LBB1_2
240; GFX6-NEXT:  ; %bb.1:
241; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
242; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
243; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX6-NEXT:    s_mul_i32 s0, s8, s0
245; GFX6-NEXT:    v_mov_b32_e32 v1, s0
246; GFX6-NEXT:    v_mov_b32_e32 v2, 0
247; GFX6-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
248; GFX6-NEXT:  .LBB1_2:
249; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
250; GFX6-NEXT:    s_waitcnt vmcnt(0)
251; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
252; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
254; GFX6-NEXT:    s_mov_b32 s7, 0xf000
255; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
256; GFX6-NEXT:    s_mov_b32 s6, -1
257; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
258; GFX6-NEXT:    s_endpgm
259;
260; GFX8-LABEL: add_i32_uniform:
261; GFX8:       ; %bb.0: ; %entry
262; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
263; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
264; GFX8-NEXT:    s_mov_b64 s[6:7], exec
265; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
266; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
267; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
268; GFX8-NEXT:    ; implicit-def: $vgpr1
269; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
270; GFX8-NEXT:    s_cbranch_execz .LBB1_2
271; GFX8-NEXT:  ; %bb.1:
272; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
273; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
274; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
275; GFX8-NEXT:    s_mul_i32 s0, s8, s0
276; GFX8-NEXT:    v_mov_b32_e32 v1, s0
277; GFX8-NEXT:    v_mov_b32_e32 v2, 0
278; GFX8-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
279; GFX8-NEXT:  .LBB1_2:
280; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
281; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
283; GFX8-NEXT:    s_waitcnt vmcnt(0)
284; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
285; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
286; GFX8-NEXT:    v_mov_b32_e32 v0, s2
287; GFX8-NEXT:    v_mov_b32_e32 v1, s3
288; GFX8-NEXT:    flat_store_dword v[0:1], v2
289; GFX8-NEXT:    s_endpgm
290;
291; GFX9-LABEL: add_i32_uniform:
292; GFX9:       ; %bb.0: ; %entry
293; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
294; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
295; GFX9-NEXT:    s_mov_b64 s[6:7], exec
296; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
297; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
298; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
299; GFX9-NEXT:    ; implicit-def: $vgpr1
300; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
301; GFX9-NEXT:    s_cbranch_execz .LBB1_2
302; GFX9-NEXT:  ; %bb.1:
303; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
304; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
305; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX9-NEXT:    s_mul_i32 s0, s8, s0
307; GFX9-NEXT:    v_mov_b32_e32 v1, s0
308; GFX9-NEXT:    v_mov_b32_e32 v2, 0
309; GFX9-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
310; GFX9-NEXT:  .LBB1_2:
311; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
312; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
313; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
314; GFX9-NEXT:    s_waitcnt vmcnt(0)
315; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
316; GFX9-NEXT:    v_mov_b32_e32 v1, 0
317; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
318; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
319; GFX9-NEXT:    s_endpgm
320;
321; GFX10W64-LABEL: add_i32_uniform:
322; GFX10W64:       ; %bb.0: ; %entry
323; GFX10W64-NEXT:    s_clause 0x1
324; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
325; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
326; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
327; GFX10W64-NEXT:    ; implicit-def: $vgpr1
328; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
329; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
330; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
331; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
332; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
333; GFX10W64-NEXT:  ; %bb.1:
334; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
335; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
336; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
337; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
338; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
339; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
340; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
341; GFX10W64-NEXT:  .LBB1_2:
342; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
343; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
344; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
345; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
346; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
348; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
349; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
350; GFX10W64-NEXT:    s_endpgm
351;
352; GFX10W32-LABEL: add_i32_uniform:
353; GFX10W32:       ; %bb.0: ; %entry
354; GFX10W32-NEXT:    s_clause 0x1
355; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
356; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
357; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
358; GFX10W32-NEXT:    ; implicit-def: $vgpr1
359; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
360; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
361; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
362; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
363; GFX10W32-NEXT:  ; %bb.1:
364; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
365; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
366; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
367; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
369; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
370; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
371; GFX10W32-NEXT:  .LBB1_2:
372; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
373; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
374; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
375; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
376; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
377; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
378; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
379; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
380; GFX10W32-NEXT:    s_endpgm
381;
382; GFX11W64-LABEL: add_i32_uniform:
383; GFX11W64:       ; %bb.0: ; %entry
384; GFX11W64-NEXT:    s_clause 0x1
385; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
386; GFX11W64-NEXT:    s_load_b32 s8, s[0:1], 0x44
387; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
388; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
389; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
390; GFX11W64-NEXT:    ; implicit-def: $vgpr1
391; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
392; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
393; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
394; GFX11W64-NEXT:    s_cbranch_execz .LBB1_2
395; GFX11W64-NEXT:  ; %bb.1:
396; GFX11W64-NEXT:    s_load_b128 s[12:15], s[0:1], 0x34
397; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
398; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
399; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
400; GFX11W64-NEXT:    s_mul_i32 s0, s8, s0
401; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
402; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
403; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, v2, s[12:15], 0 idxen glc
404; GFX11W64-NEXT:  .LBB1_2:
405; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
406; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
407; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
408; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
409; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
410; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
411; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
412; GFX11W64-NEXT:    global_store_b32 v0, v1, s[2:3]
413; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
414; GFX11W64-NEXT:    s_endpgm
415;
416; GFX11W32-LABEL: add_i32_uniform:
417; GFX11W32:       ; %bb.0: ; %entry
418; GFX11W32-NEXT:    s_clause 0x1
419; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
420; GFX11W32-NEXT:    s_load_b32 s4, s[0:1], 0x44
421; GFX11W32-NEXT:    s_mov_b32 s6, exec_lo
422; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
423; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
424; GFX11W32-NEXT:    ; implicit-def: $vgpr1
425; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
426; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
427; GFX11W32-NEXT:    s_cbranch_execz .LBB1_2
428; GFX11W32-NEXT:  ; %bb.1:
429; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
430; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s6
431; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
432; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
433; GFX11W32-NEXT:    s_mul_i32 s0, s4, s0
434; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
435; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
436; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, v2, s[8:11], 0 idxen glc
437; GFX11W32-NEXT:  .LBB1_2:
438; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
439; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
440; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
441; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
442; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
443; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1]
444; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
445; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
446; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
447; GFX11W32-NEXT:    s_endpgm
; IR under test. The assertions above are autogenerated (see the NOTE at the
; top of the file); regenerate them with the script instead of hand-editing.
448entry:
; Atomic add of the kernel argument %additive; the pre-add value comes back
; in %old.
449  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
; Storing %old keeps the atomic's return value live, so the per-lane result
; reconstruction appears in the checked output.
450  store i32 %old, i32 addrspace(1)* %out
451  ret void
452}
453
454define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
455; GFX6-LABEL: add_i32_varying_vdata:
456; GFX6:       ; %bb.0: ; %entry
457; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
458; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
459; GFX6-NEXT:    v_mov_b32_e32 v1, 0
460; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
461; GFX6-NEXT:    buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
462; GFX6-NEXT:    s_mov_b32 s3, 0xf000
463; GFX6-NEXT:    s_mov_b32 s2, -1
464; GFX6-NEXT:    s_waitcnt vmcnt(0)
465; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
466; GFX6-NEXT:    s_endpgm
467;
468; GFX8-LABEL: add_i32_varying_vdata:
469; GFX8:       ; %bb.0: ; %entry
470; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
471; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
472; GFX8-NEXT:    v_mov_b32_e32 v1, 0
473; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
474; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
475; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
476; GFX8-NEXT:    v_mov_b32_e32 v2, v0
477; GFX8-NEXT:    s_not_b64 exec, exec
478; GFX8-NEXT:    v_mov_b32_e32 v2, 0
479; GFX8-NEXT:    s_not_b64 exec, exec
480; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
481; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
482; GFX8-NEXT:    s_nop 1
483; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
484; GFX8-NEXT:    s_nop 1
485; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
486; GFX8-NEXT:    s_nop 1
487; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
488; GFX8-NEXT:    s_nop 1
489; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
490; GFX8-NEXT:    s_nop 1
491; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
492; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
493; GFX8-NEXT:    s_nop 0
494; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
495; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
496; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
497; GFX8-NEXT:    ; implicit-def: $vgpr0
498; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
499; GFX8-NEXT:    s_cbranch_execz .LBB2_2
500; GFX8-NEXT:  ; %bb.1:
501; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
502; GFX8-NEXT:    v_mov_b32_e32 v0, s6
503; GFX8-NEXT:    v_mov_b32_e32 v3, 0
504; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
505; GFX8-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
506; GFX8-NEXT:  .LBB2_2:
507; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
508; GFX8-NEXT:    s_waitcnt vmcnt(0)
509; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
510; GFX8-NEXT:    v_mov_b32_e32 v0, v1
511; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
512; GFX8-NEXT:    v_mov_b32_e32 v4, s3
513; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
514; GFX8-NEXT:    v_mov_b32_e32 v3, s2
515; GFX8-NEXT:    flat_store_dword v[3:4], v0
516; GFX8-NEXT:    s_endpgm
517;
518; GFX9-LABEL: add_i32_varying_vdata:
519; GFX9:       ; %bb.0: ; %entry
520; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
521; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
522; GFX9-NEXT:    v_mov_b32_e32 v1, 0
523; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
524; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
525; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
526; GFX9-NEXT:    v_mov_b32_e32 v2, v0
527; GFX9-NEXT:    s_not_b64 exec, exec
528; GFX9-NEXT:    v_mov_b32_e32 v2, 0
529; GFX9-NEXT:    s_not_b64 exec, exec
530; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
531; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
532; GFX9-NEXT:    s_nop 1
533; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
534; GFX9-NEXT:    s_nop 1
535; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
536; GFX9-NEXT:    s_nop 1
537; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
538; GFX9-NEXT:    s_nop 1
539; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
540; GFX9-NEXT:    s_nop 1
541; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
542; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
543; GFX9-NEXT:    s_nop 0
544; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
545; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
546; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
547; GFX9-NEXT:    ; implicit-def: $vgpr0
548; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
549; GFX9-NEXT:    s_cbranch_execz .LBB2_2
550; GFX9-NEXT:  ; %bb.1:
551; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
552; GFX9-NEXT:    v_mov_b32_e32 v0, s6
553; GFX9-NEXT:    v_mov_b32_e32 v3, 0
554; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
555; GFX9-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
556; GFX9-NEXT:  .LBB2_2:
557; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
558; GFX9-NEXT:    s_waitcnt vmcnt(0)
559; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
560; GFX9-NEXT:    v_mov_b32_e32 v0, v1
561; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
562; GFX9-NEXT:    v_mov_b32_e32 v3, 0
563; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
564; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
565; GFX9-NEXT:    s_endpgm
566;
567; GFX10W64-LABEL: add_i32_varying_vdata:
568; GFX10W64:       ; %bb.0: ; %entry
569; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
570; GFX10W64-NEXT:    s_not_b64 exec, exec
571; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
572; GFX10W64-NEXT:    s_not_b64 exec, exec
573; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
574; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
575; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
576; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
577; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
578; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
579; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
580; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
581; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
582; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
583; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
584; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
585; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
586; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
587; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
588; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
589; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
590; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
591; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
592; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
593; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
594; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
595; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
596; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
597; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
598; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
599; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
600; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
601; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
602; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
603; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
604; GFX10W64-NEXT:    ; implicit-def: $vgpr0
605; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
606; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
607; GFX10W64-NEXT:  ; %bb.1:
608; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
609; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
610; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
611; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
612; GFX10W64-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
613; GFX10W64-NEXT:  .LBB2_2:
614; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
615; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
616; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
617; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
618; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
619; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
620; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
621; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
622; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
623; GFX10W64-NEXT:    s_endpgm
624;
625; GFX10W32-LABEL: add_i32_varying_vdata:
626; GFX10W32:       ; %bb.0: ; %entry
627; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
628; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
629; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
630; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
631; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
632; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
633; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
634; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
635; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
636; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
637; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
638; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
639; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
640; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
641; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
642; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
643; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
644; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
645; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
646; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
647; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
648; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
649; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
650; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
651; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
652; GFX10W32-NEXT:    ; implicit-def: $vgpr0
653; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
654; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
655; GFX10W32-NEXT:  ; %bb.1:
656; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
657; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
658; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
659; GFX10W32-NEXT:    s_mov_b32 s5, s6
660; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
661; GFX10W32-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
662; GFX10W32-NEXT:  .LBB2_2:
663; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
664; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
665; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
666; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
667; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
668; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
669; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
670; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
671; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
672; GFX10W32-NEXT:    s_endpgm
673;
674; GFX11W64-LABEL: add_i32_varying_vdata:
675; GFX11W64:       ; %bb.0: ; %entry
676; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
677; GFX11W64-NEXT:    s_not_b64 exec, exec
678; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
679; GFX11W64-NEXT:    s_not_b64 exec, exec
680; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
681; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
682; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
683; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
684; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
685; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
686; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
687; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
688; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
689; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
690; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
691; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
692; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
693; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
694; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
695; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
696; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
697; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
698; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 15
699; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
700; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
701; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
702; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
703; GFX11W64-NEXT:    v_readlane_b32 s7, v1, 31
704; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 16
705; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
706; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
707; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
708; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
709; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 63
710; GFX11W64-NEXT:    v_readlane_b32 s8, v1, 47
711; GFX11W64-NEXT:    v_writelane_b32 v3, s7, 32
712; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
713; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
714; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
715; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
716; GFX11W64-NEXT:    v_writelane_b32 v3, s8, 48
717; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
718; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
719; GFX11W64-NEXT:    ; implicit-def: $vgpr0
720; GFX11W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
721; GFX11W64-NEXT:    s_cbranch_execz .LBB2_2
722; GFX11W64-NEXT:  ; %bb.1:
723; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
724; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
725; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
726; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
727; GFX11W64-NEXT:    buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
728; GFX11W64-NEXT:  .LBB2_2:
729; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
730; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
731; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v0
732; GFX11W64-NEXT:    v_mov_b32_e32 v0, v3
733; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
734; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
735; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
736; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
737; GFX11W64-NEXT:    global_store_b32 v4, v0, s[2:3]
738; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
739; GFX11W64-NEXT:    s_endpgm
740;
741; GFX11W32-LABEL: add_i32_varying_vdata:
742; GFX11W32:       ; %bb.0: ; %entry
743; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
744; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
745; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
746; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
747; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
748; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
749; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
750; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
751; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
752; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
753; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
754; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
755; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
756; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
757; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
758; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
759; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
760; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
761; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
762; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
763; GFX11W32-NEXT:    v_readlane_b32 s6, v1, 31
764; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
765; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
766; GFX11W32-NEXT:    v_readlane_b32 s5, v1, 15
767; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
768; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
769; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
770; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
771; GFX11W32-NEXT:    v_writelane_b32 v3, s5, 16
772; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
773; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
774; GFX11W32-NEXT:    ; implicit-def: $vgpr0
775; GFX11W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
776; GFX11W32-NEXT:    s_cbranch_execz .LBB2_2
777; GFX11W32-NEXT:  ; %bb.1:
778; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
779; GFX11W32-NEXT:    v_mov_b32_e32 v0, s6
780; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
781; GFX11W32-NEXT:    s_mov_b32 s5, s6
782; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
783; GFX11W32-NEXT:    buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
784; GFX11W32-NEXT:  .LBB2_2:
785; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
786; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
787; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
788; GFX11W32-NEXT:    v_mov_b32_e32 v0, v3
789; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
790; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
791; GFX11W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
792; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
793; GFX11W32-NEXT:    global_store_b32 v4, v0, s[2:3]
794; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
795; GFX11W32-NEXT:    s_endpgm
796entry:
797  %lane = call i32 @llvm.amdgcn.workitem.id.x()
798  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
799  store i32 %old, i32 addrspace(1)* %out
800  ret void
801}
802
; Atomic add of the constant 1 where the buffer index (third intrinsic
; operand) is the workitem id, so the index is a per-lane value. Every
; expected sequence below issues one buffer atomic add with "idxen", using v0
; as the index, and contains no mbcnt / exec-mask reduction sequence, i.e. the
; atomic is emitted as written in the IR.
803define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
804; GFX6-LABEL: add_i32_varying_vindex:
805; GFX6:       ; %bb.0: ; %entry
806; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
807; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
808; GFX6-NEXT:    v_mov_b32_e32 v1, 1
809; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
810; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
811; GFX6-NEXT:    s_mov_b32 s3, 0xf000
812; GFX6-NEXT:    s_mov_b32 s2, -1
813; GFX6-NEXT:    s_waitcnt vmcnt(0)
814; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
815; GFX6-NEXT:    s_endpgm
816;
817; GFX8-LABEL: add_i32_varying_vindex:
818; GFX8:       ; %bb.0: ; %entry
819; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
820; GFX8-NEXT:    v_mov_b32_e32 v2, 1
821; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
822; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 idxen glc
824; GFX8-NEXT:    v_mov_b32_e32 v0, s0
825; GFX8-NEXT:    v_mov_b32_e32 v1, s1
826; GFX8-NEXT:    s_waitcnt vmcnt(0)
827; GFX8-NEXT:    flat_store_dword v[0:1], v2
828; GFX8-NEXT:    s_endpgm
829;
830; GFX9-LABEL: add_i32_varying_vindex:
831; GFX9:       ; %bb.0: ; %entry
832; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
833; GFX9-NEXT:    v_mov_b32_e32 v1, 1
834; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
835; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
836; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
837; GFX9-NEXT:    v_mov_b32_e32 v0, 0
838; GFX9-NEXT:    s_waitcnt vmcnt(0)
839; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
840; GFX9-NEXT:    s_endpgm
841;
842; GFX10-LABEL: add_i32_varying_vindex:
843; GFX10:       ; %bb.0: ; %entry
844; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
845; GFX10-NEXT:    v_mov_b32_e32 v1, 1
846; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
847; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
848; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
849; GFX10-NEXT:    v_mov_b32_e32 v0, 0
850; GFX10-NEXT:    s_waitcnt vmcnt(0)
851; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
852; GFX10-NEXT:    s_endpgm
853;
854; GFX11-LABEL: add_i32_varying_vindex:
855; GFX11:       ; %bb.0: ; %entry
856; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
857; GFX11-NEXT:    v_mov_b32_e32 v1, 1
858; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
859; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
860; GFX11-NEXT:    buffer_atomic_add_u32 v1, v0, s[4:7], 0 idxen glc
861; GFX11-NEXT:    v_mov_b32_e32 v0, 0
862; GFX11-NEXT:    s_waitcnt vmcnt(0)
863; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
864; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
865; GFX11-NEXT:    s_endpgm
866entry:
867  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; The data operand is the constant 1; %lane is passed as the index operand.
868  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
; The value returned by the atomic is written to %out.
869  store i32 %old, i32 addrspace(1)* %out
870  ret void
871}
872
; Atomic add of the constant 1 where the offset (fourth intrinsic operand) is
; the workitem id and the index operand is the constant 0. The expected code
; builds a two-register address v[0:1] (v0 = 0, v1 = copy of the incoming v0)
; and issues a single buffer atomic add with "idxen offen". No mbcnt /
; exec-mask reduction sequence appears, i.e. the atomic is emitted as written
; in the IR. The gfx11 expectations are split per wave size because the
; wave32 output pairs the two address moves into one v_dual_mov_b32.
873define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
874; GFX6-LABEL: add_i32_varying_offset:
875; GFX6:       ; %bb.0: ; %entry
876; GFX6-NEXT:    v_mov_b32_e32 v1, v0
877; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
878; GFX6-NEXT:    s_mov_b32 s2, 0
879; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
880; GFX6-NEXT:    v_mov_b32_e32 v0, s2
881; GFX6-NEXT:    v_mov_b32_e32 v2, 1
882; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX6-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
884; GFX6-NEXT:    s_mov_b32 s3, 0xf000
885; GFX6-NEXT:    s_mov_b32 s2, -1
886; GFX6-NEXT:    s_waitcnt vmcnt(0)
887; GFX6-NEXT:    buffer_store_dword v2, off, s[0:3], 0
888; GFX6-NEXT:    s_endpgm
889;
890; GFX8-LABEL: add_i32_varying_offset:
891; GFX8:       ; %bb.0: ; %entry
892; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
893; GFX8-NEXT:    s_mov_b32 s2, 0
894; GFX8-NEXT:    v_mov_b32_e32 v1, v0
895; GFX8-NEXT:    v_mov_b32_e32 v0, s2
896; GFX8-NEXT:    v_mov_b32_e32 v2, 1
897; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
898; GFX8-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
899; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
900; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
901; GFX8-NEXT:    v_mov_b32_e32 v0, s0
902; GFX8-NEXT:    v_mov_b32_e32 v1, s1
903; GFX8-NEXT:    s_waitcnt vmcnt(0)
904; GFX8-NEXT:    flat_store_dword v[0:1], v2
905; GFX8-NEXT:    s_endpgm
906;
907; GFX9-LABEL: add_i32_varying_offset:
908; GFX9:       ; %bb.0: ; %entry
909; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
910; GFX9-NEXT:    s_mov_b32 s2, 0
911; GFX9-NEXT:    v_mov_b32_e32 v1, v0
912; GFX9-NEXT:    v_mov_b32_e32 v0, s2
913; GFX9-NEXT:    v_mov_b32_e32 v2, 1
914; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
915; GFX9-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
916; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
917; GFX9-NEXT:    v_mov_b32_e32 v0, 0
918; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
919; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
920; GFX9-NEXT:    s_endpgm
921;
922; GFX10-LABEL: add_i32_varying_offset:
923; GFX10:       ; %bb.0: ; %entry
924; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
925; GFX10-NEXT:    s_mov_b32 s2, 0
926; GFX10-NEXT:    v_mov_b32_e32 v1, v0
927; GFX10-NEXT:    v_mov_b32_e32 v0, s2
928; GFX10-NEXT:    v_mov_b32_e32 v2, 1
929; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
930; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
931; GFX10-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
932; GFX10-NEXT:    v_mov_b32_e32 v0, 0
933; GFX10-NEXT:    s_waitcnt vmcnt(0)
934; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
935; GFX10-NEXT:    s_endpgm
936;
937; GFX11W64-LABEL: add_i32_varying_offset:
938; GFX11W64:       ; %bb.0: ; %entry
939; GFX11W64-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
940; GFX11W64-NEXT:    s_mov_b32 s2, 0
941; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
942; GFX11W64-NEXT:    v_mov_b32_e32 v0, s2
943; GFX11W64-NEXT:    v_mov_b32_e32 v2, 1
944; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
945; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX11W64-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
947; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
948; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
949; GFX11W64-NEXT:    global_store_b32 v0, v2, s[0:1]
950; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
951; GFX11W64-NEXT:    s_endpgm
952;
953; GFX11W32-LABEL: add_i32_varying_offset:
954; GFX11W32:       ; %bb.0: ; %entry
955; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
956; GFX11W32-NEXT:    s_mov_b32 s2, 0
957; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
958; GFX11W32-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2
959; GFX11W32-NEXT:    v_mov_b32_e32 v2, 1
960; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
961; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
962; GFX11W32-NEXT:    buffer_atomic_add_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
963; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
964; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
965; GFX11W32-NEXT:    global_store_b32 v0, v2, s[0:1]
966; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
967; GFX11W32-NEXT:    s_endpgm
968entry:
969  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; The data operand is the constant 1, the index operand is 0, and %lane is
; passed as the offset operand.
970  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
; The value returned by the atomic is written to %out.
971  store i32 %old, i32 addrspace(1)* %out
972  ret void
973}
974
; Atomic sub of the constant 5 with constant (zero) index and offset operands.
; The expected code for every target follows the same three steps:
;   1. A copy of exec is fed to v_mbcnt_lo (and v_mbcnt_hi on wave64) to give
;      each lane its rank among the active lanes; only the lane whose rank is
;      0 falls through into %bb.1, the others branch to .LBB5_2.
;   2. That lane issues a single buffer atomic sub whose data is
;      5 * popcount(saved exec) (s_bcnt1 followed by s_mul_i32 with 5).
;   3. After exec is restored at .LBB5_2, the returned value is broadcast with
;      v_readfirstlane and each lane subtracts 5 * rank from it; the result is
;      stored to %out.
; On gfx11 the compare-and-mask step uses v_cmpx with a separately saved exec
; copy instead of v_cmp + s_and_saveexec.
975define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
976; GFX6-LABEL: sub_i32_constant:
977; GFX6:       ; %bb.0: ; %entry
978; GFX6-NEXT:    s_mov_b64 s[6:7], exec
979; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
980; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
981; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
982; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
983; GFX6-NEXT:    ; implicit-def: $vgpr1
984; GFX6-NEXT:    s_and_saveexec_b64 s[2:3], vcc
985; GFX6-NEXT:    s_cbranch_execz .LBB5_2
986; GFX6-NEXT:  ; %bb.1:
987; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
988; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
989; GFX6-NEXT:    s_mul_i32 s0, s0, 5
990; GFX6-NEXT:    v_mov_b32_e32 v1, s0
991; GFX6-NEXT:    v_mov_b32_e32 v2, 0
992; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
994; GFX6-NEXT:  .LBB5_2:
995; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
996; GFX6-NEXT:    s_waitcnt vmcnt(0)
997; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
998; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
999; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1000; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1001; GFX6-NEXT:    s_mov_b32 s6, -1
1002; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1003; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1004; GFX6-NEXT:    s_endpgm
1005;
1006; GFX8-LABEL: sub_i32_constant:
1007; GFX8:       ; %bb.0: ; %entry
1008; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1009; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1010; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1011; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1012; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1013; GFX8-NEXT:    ; implicit-def: $vgpr1
1014; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1015; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1016; GFX8-NEXT:  ; %bb.1:
1017; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1018; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1019; GFX8-NEXT:    s_mul_i32 s0, s0, 5
1020; GFX8-NEXT:    v_mov_b32_e32 v1, s0
1021; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1022; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1023; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1024; GFX8-NEXT:  .LBB5_2:
1025; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1026; GFX8-NEXT:    s_waitcnt vmcnt(0)
1027; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1028; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1029; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
1030; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1031; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1032; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1033; GFX8-NEXT:    flat_store_dword v[0:1], v2
1034; GFX8-NEXT:    s_endpgm
1035;
1036; GFX9-LABEL: sub_i32_constant:
1037; GFX9:       ; %bb.0: ; %entry
1038; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1039; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1040; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1041; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1042; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1043; GFX9-NEXT:    ; implicit-def: $vgpr1
1044; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1045; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1046; GFX9-NEXT:  ; %bb.1:
1047; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1048; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1049; GFX9-NEXT:    s_mul_i32 s0, s0, 5
1050; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1051; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1052; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1053; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1054; GFX9-NEXT:  .LBB5_2:
1055; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1056; GFX9-NEXT:    s_waitcnt vmcnt(0)
1057; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1058; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1059; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1060; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1061; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1062; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
1063; GFX9-NEXT:    s_endpgm
1064;
1065; GFX10W64-LABEL: sub_i32_constant:
1066; GFX10W64:       ; %bb.0: ; %entry
1067; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1068; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
1069; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1070; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1071; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1072; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1073; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1074; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
1075; GFX10W64-NEXT:  ; %bb.1:
1076; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1077; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1078; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
1079; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
1080; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
1081; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1082; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1083; GFX10W64-NEXT:  .LBB5_2:
1084; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1085; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1086; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1087; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
1088; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1089; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1090; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1091; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1092; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
1093; GFX10W64-NEXT:    s_endpgm
1094;
1095; GFX10W32-LABEL: sub_i32_constant:
1096; GFX10W32:       ; %bb.0: ; %entry
1097; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1098; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
1099; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1100; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1101; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1102; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1103; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
1104; GFX10W32-NEXT:  ; %bb.1:
1105; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1106; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
1107; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
1108; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
1109; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
1110; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1111; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1112; GFX10W32-NEXT:  .LBB5_2:
1113; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1114; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1115; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1116; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1117; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1118; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1119; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1120; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1121; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1122; GFX10W32-NEXT:    s_endpgm
1123;
1124; GFX11W64-LABEL: sub_i32_constant:
1125; GFX11W64:       ; %bb.0: ; %entry
1126; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1127; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
1128; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
1129; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1130; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1131; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1132; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1133; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1134; GFX11W64-NEXT:    s_cbranch_execz .LBB5_2
1135; GFX11W64-NEXT:  ; %bb.1:
1136; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1137; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1138; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
1139; GFX11W64-NEXT:    s_mul_i32 s0, s0, 5
1140; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1141; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
1142; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1143; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
1144; GFX11W64-NEXT:  .LBB5_2:
1145; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1146; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1147; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
1148; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1149; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1150; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1151; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1152; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1153; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
1154; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1155; GFX11W64-NEXT:    s_endpgm
1156;
1157; GFX11W32-LABEL: sub_i32_constant:
1158; GFX11W32:       ; %bb.0: ; %entry
1159; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1160; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
1161; GFX11W32-NEXT:    s_mov_b32 s4, exec_lo
1162; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1163; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1164; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1165; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1166; GFX11W32-NEXT:    s_cbranch_execz .LBB5_2
1167; GFX11W32-NEXT:  ; %bb.1:
1168; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1169; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s5
1170; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
1171; GFX11W32-NEXT:    s_mul_i32 s0, s0, 5
1172; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1173; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
1174; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1175; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
1176; GFX11W32-NEXT:  .LBB5_2:
1177; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1178; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1179; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
1180; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1181; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1182; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1183; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1184; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1185; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1186; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1187; GFX11W32-NEXT:    s_endpgm
1188entry:
; All operands other than the buffer descriptor are constants: data = 5 and
; the remaining four operands are 0.
1189  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
; The value returned by the atomic is written to %out.
1190  store i32 %old, i32 addrspace(1)* %out
1191  ret void
1192}
1193
1194define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
1195; GFX6-LABEL: sub_i32_uniform:
1196; GFX6:       ; %bb.0: ; %entry
1197; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1198; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1199; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
1200; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1201; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1202; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1203; GFX6-NEXT:    ; implicit-def: $vgpr1
1204; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1205; GFX6-NEXT:    s_cbranch_execz .LBB6_2
1206; GFX6-NEXT:  ; %bb.1:
1207; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
1208; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
1209; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1210; GFX6-NEXT:    s_mul_i32 s0, s8, s0
1211; GFX6-NEXT:    v_mov_b32_e32 v1, s0
1212; GFX6-NEXT:    v_mov_b32_e32 v2, 0
1213; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
1214; GFX6-NEXT:  .LBB6_2:
1215; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
1216; GFX6-NEXT:    s_waitcnt vmcnt(0)
1217; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
1218; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1219; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
1220; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1221; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1222; GFX6-NEXT:    s_mov_b32 s6, -1
1223; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1224; GFX6-NEXT:    s_endpgm
1225;
1226; GFX8-LABEL: sub_i32_uniform:
1227; GFX8:       ; %bb.0: ; %entry
1228; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1229; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
1230; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1231; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1232; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1233; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1234; GFX8-NEXT:    ; implicit-def: $vgpr1
1235; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1236; GFX8-NEXT:    s_cbranch_execz .LBB6_2
1237; GFX8-NEXT:  ; %bb.1:
1238; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1239; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1240; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1241; GFX8-NEXT:    s_mul_i32 s0, s8, s0
1242; GFX8-NEXT:    v_mov_b32_e32 v1, s0
1243; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1244; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
1245; GFX8-NEXT:  .LBB6_2:
1246; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1247; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1248; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
1249; GFX8-NEXT:    s_waitcnt vmcnt(0)
1250; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1251; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
1252; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1253; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1254; GFX8-NEXT:    flat_store_dword v[0:1], v2
1255; GFX8-NEXT:    s_endpgm
1256;
1257; GFX9-LABEL: sub_i32_uniform:
1258; GFX9:       ; %bb.0: ; %entry
1259; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1260; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
1261; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1262; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1263; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1264; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1265; GFX9-NEXT:    ; implicit-def: $vgpr1
1266; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1267; GFX9-NEXT:    s_cbranch_execz .LBB6_2
1268; GFX9-NEXT:  ; %bb.1:
1269; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1270; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1271; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1272; GFX9-NEXT:    s_mul_i32 s0, s8, s0
1273; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1274; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1275; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
1276; GFX9-NEXT:  .LBB6_2:
1277; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1278; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1279; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
1280; GFX9-NEXT:    s_waitcnt vmcnt(0)
1281; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1282; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1283; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1284; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
1285; GFX9-NEXT:    s_endpgm
1286;
1287; GFX10W64-LABEL: sub_i32_uniform:
1288; GFX10W64:       ; %bb.0: ; %entry
1289; GFX10W64-NEXT:    s_clause 0x1
1290; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1291; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
1292; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
1293; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1294; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1295; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1296; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1297; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1298; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
1299; GFX10W64-NEXT:  ; %bb.1:
1300; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1301; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1302; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
1303; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1304; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
1305; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
1306; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
1307; GFX10W64-NEXT:  .LBB6_2:
1308; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1309; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1310; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1311; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
1312; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1313; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
1314; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1315; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1316; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
1317; GFX10W64-NEXT:    s_endpgm
1318;
1319; GFX10W32-LABEL: sub_i32_uniform:
1320; GFX10W32:       ; %bb.0: ; %entry
1321; GFX10W32-NEXT:    s_clause 0x1
1322; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1323; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
1324; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
1325; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1326; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1327; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1328; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
1329; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
1330; GFX10W32-NEXT:  ; %bb.1:
1331; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1332; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
1333; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
1334; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1335; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
1336; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
1337; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
1338; GFX10W32-NEXT:  .LBB6_2:
1339; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1340; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
1341; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1342; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
1343; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1344; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1345; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1346; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1347; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1348; GFX10W32-NEXT:    s_endpgm
1349;
1350; GFX11W64-LABEL: sub_i32_uniform:
1351; GFX11W64:       ; %bb.0: ; %entry
1352; GFX11W64-NEXT:    s_clause 0x1
1353; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1354; GFX11W64-NEXT:    s_load_b32 s8, s[0:1], 0x44
1355; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
1356; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
1357; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1358; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1359; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1360; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1361; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1362; GFX11W64-NEXT:    s_cbranch_execz .LBB6_2
1363; GFX11W64-NEXT:  ; %bb.1:
1364; GFX11W64-NEXT:    s_load_b128 s[12:15], s[0:1], 0x34
1365; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1366; GFX11W64-NEXT:    v_mov_b32_e32 v2, 0
1367; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1368; GFX11W64-NEXT:    s_mul_i32 s0, s8, s0
1369; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1370; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
1371; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, v2, s[12:15], 0 idxen glc
1372; GFX11W64-NEXT:  .LBB6_2:
1373; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1374; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1375; GFX11W64-NEXT:    v_mul_lo_u32 v0, s8, v0
1376; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1377; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
1378; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1379; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1380; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1381; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
1382; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1383; GFX11W64-NEXT:    s_endpgm
1384;
1385; GFX11W32-LABEL: sub_i32_uniform:
1386; GFX11W32:       ; %bb.0: ; %entry
1387; GFX11W32-NEXT:    s_clause 0x1
1388; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1389; GFX11W32-NEXT:    s_load_b32 s4, s[0:1], 0x44
1390; GFX11W32-NEXT:    s_mov_b32 s6, exec_lo
1391; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
1392; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1393; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1394; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1395; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1396; GFX11W32-NEXT:    s_cbranch_execz .LBB6_2
1397; GFX11W32-NEXT:  ; %bb.1:
1398; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1399; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s6
1400; GFX11W32-NEXT:    v_mov_b32_e32 v2, 0
1401; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1402; GFX11W32-NEXT:    s_mul_i32 s0, s4, s0
1403; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1404; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
1405; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, v2, s[8:11], 0 idxen glc
1406; GFX11W32-NEXT:  .LBB6_2:
1407; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
1408; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1409; GFX11W32-NEXT:    v_mul_lo_u32 v0, s4, v0
1410; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1411; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
1412; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1413; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1414; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1415; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1416; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1417; GFX11W32-NEXT:    s_endpgm
1418entry:
1419  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
1420  store i32 %old, i32 addrspace(1)* %out
1421  ret void
1422}
1423
; Atomic sub whose data operand (the first intrinsic argument) is the work-item
; id, so each lane supplies a different value; the two i32 operands after the
; resource are the constant 0.
; Per the checks below:
;   * GFX6 issues buffer_atomic_sub directly on v0, with no cross-lane work.
;   * GFX8 and newer first combine the lane values with a chain of DPP adds,
;     let only the lane whose mbcnt result is 0 issue a single
;     buffer_atomic_sub, then form each lane's result with v_readfirstlane
;     followed by a subtract.
1424define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
1425; GFX6-LABEL: sub_i32_varying_vdata:
1426; GFX6:       ; %bb.0: ; %entry
1427; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1428; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1429; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1430; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1431; GFX6-NEXT:    buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc
1432; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1433; GFX6-NEXT:    s_mov_b32 s2, -1
1434; GFX6-NEXT:    s_waitcnt vmcnt(0)
1435; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1436; GFX6-NEXT:    s_endpgm
1437;
1438; GFX8-LABEL: sub_i32_varying_vdata:
1439; GFX8:       ; %bb.0: ; %entry
1440; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1441; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1442; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1443; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1444; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1445; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1446; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1447; GFX8-NEXT:    s_not_b64 exec, exec
1448; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1449; GFX8-NEXT:    s_not_b64 exec, exec
1450; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1451; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1452; GFX8-NEXT:    s_nop 1
1453; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1454; GFX8-NEXT:    s_nop 1
1455; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1456; GFX8-NEXT:    s_nop 1
1457; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1458; GFX8-NEXT:    s_nop 1
1459; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1460; GFX8-NEXT:    s_nop 1
1461; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1462; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
1463; GFX8-NEXT:    s_nop 0
1464; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1465; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1466; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1467; GFX8-NEXT:    ; implicit-def: $vgpr0
1468; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1469; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1470; GFX8-NEXT:  ; %bb.1:
1471; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1472; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1473; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1474; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1475; GFX8-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
1476; GFX8-NEXT:  .LBB7_2:
1477; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1478; GFX8-NEXT:    s_waitcnt vmcnt(0)
1479; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1480; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1481; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1482; GFX8-NEXT:    v_mov_b32_e32 v4, s3
1483; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1484; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1485; GFX8-NEXT:    flat_store_dword v[3:4], v0
1486; GFX8-NEXT:    s_endpgm
1487;
1488; GFX9-LABEL: sub_i32_varying_vdata:
1489; GFX9:       ; %bb.0: ; %entry
1490; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1491; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1492; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1493; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1494; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1495; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1496; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1497; GFX9-NEXT:    s_not_b64 exec, exec
1498; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1499; GFX9-NEXT:    s_not_b64 exec, exec
1500; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1501; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1502; GFX9-NEXT:    s_nop 1
1503; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1504; GFX9-NEXT:    s_nop 1
1505; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1506; GFX9-NEXT:    s_nop 1
1507; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1508; GFX9-NEXT:    s_nop 1
1509; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1510; GFX9-NEXT:    s_nop 1
1511; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1512; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
1513; GFX9-NEXT:    s_nop 0
1514; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1515; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1516; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1517; GFX9-NEXT:    ; implicit-def: $vgpr0
1518; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1519; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1520; GFX9-NEXT:  ; %bb.1:
1521; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1522; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1523; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1524; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1525; GFX9-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
1526; GFX9-NEXT:  .LBB7_2:
1527; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1528; GFX9-NEXT:    s_waitcnt vmcnt(0)
1529; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1530; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1531; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1532; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1533; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1534; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
1535; GFX9-NEXT:    s_endpgm
1536;
1537; GFX10W64-LABEL: sub_i32_varying_vdata:
1538; GFX10W64:       ; %bb.0: ; %entry
1539; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
1540; GFX10W64-NEXT:    s_not_b64 exec, exec
1541; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1542; GFX10W64-NEXT:    s_not_b64 exec, exec
1543; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1544; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1545; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
1546; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1547; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1548; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1549; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
1550; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1551; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1552; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
1553; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
1554; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1555; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
1556; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1557; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
1558; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1559; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1560; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
1561; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
1562; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1563; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1564; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1565; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
1566; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
1567; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
1568; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1569; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1570; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1571; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
1572; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1573; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1574; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1575; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1576; GFX10W64-NEXT:    s_cbranch_execz .LBB7_2
1577; GFX10W64-NEXT:  ; %bb.1:
1578; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1579; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
1580; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1581; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1582; GFX10W64-NEXT:    buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
1583; GFX10W64-NEXT:  .LBB7_2:
1584; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1585; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1586; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1587; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
1588; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
1589; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1590; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1591; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1592; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
1593; GFX10W64-NEXT:    s_endpgm
1594;
1595; GFX10W32-LABEL: sub_i32_varying_vdata:
1596; GFX10W32:       ; %bb.0: ; %entry
1597; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
1598; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1599; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1600; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1601; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
1602; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1603; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1604; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1605; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1606; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
1607; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1608; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
1609; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1610; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1611; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1612; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
1613; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
1614; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1615; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
1616; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1617; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1618; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1619; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
1620; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1621; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1622; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1623; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1624; GFX10W32-NEXT:    s_cbranch_execz .LBB7_2
1625; GFX10W32-NEXT:  ; %bb.1:
1626; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1627; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
1628; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1629; GFX10W32-NEXT:    s_mov_b32 s5, s6
1630; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1631; GFX10W32-NEXT:    buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
1632; GFX10W32-NEXT:  .LBB7_2:
1633; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1634; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1635; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1636; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1637; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1638; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1639; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1640; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1641; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1642; GFX10W32-NEXT:    s_endpgm
1643;
1644; GFX11W64-LABEL: sub_i32_varying_vdata:
1645; GFX11W64:       ; %bb.0: ; %entry
1646; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
1647; GFX11W64-NEXT:    s_not_b64 exec, exec
1648; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1649; GFX11W64-NEXT:    s_not_b64 exec, exec
1650; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1651; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1652; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1653; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
1654; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1655; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1656; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1657; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1658; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1659; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
1660; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1661; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1662; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1663; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
1664; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1665; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
1666; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1667; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1668; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 15
1669; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1670; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
1671; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1672; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1673; GFX11W64-NEXT:    v_readlane_b32 s7, v1, 31
1674; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 16
1675; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1676; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1677; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1678; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1679; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 63
1680; GFX11W64-NEXT:    v_readlane_b32 s8, v1, 47
1681; GFX11W64-NEXT:    v_writelane_b32 v3, s7, 32
1682; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1683; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1684; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1685; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1686; GFX11W64-NEXT:    v_writelane_b32 v3, s8, 48
1687; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1688; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1689; GFX11W64-NEXT:    ; implicit-def: $vgpr0
1690; GFX11W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1691; GFX11W64-NEXT:    s_cbranch_execz .LBB7_2
1692; GFX11W64-NEXT:  ; %bb.1:
1693; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1694; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
1695; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
1696; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1697; GFX11W64-NEXT:    buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc
1698; GFX11W64-NEXT:  .LBB7_2:
1699; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1700; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1701; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v0
1702; GFX11W64-NEXT:    v_mov_b32_e32 v0, v3
1703; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
1704; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1705; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1706; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1707; GFX11W64-NEXT:    global_store_b32 v4, v0, s[2:3]
1708; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1709; GFX11W64-NEXT:    s_endpgm
1710;
1711; GFX11W32-LABEL: sub_i32_varying_vdata:
1712; GFX11W32:       ; %bb.0: ; %entry
1713; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
1714; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
1715; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1716; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
1717; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
1718; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1719; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1720; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1721; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1722; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1723; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1724; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1725; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
1726; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1727; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
1728; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1729; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
1730; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1731; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1732; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
1733; GFX11W32-NEXT:    v_readlane_b32 s6, v1, 31
1734; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
1735; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1736; GFX11W32-NEXT:    v_readlane_b32 s5, v1, 15
1737; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
1738; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1739; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
1740; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1741; GFX11W32-NEXT:    v_writelane_b32 v3, s5, 16
1742; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
1743; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1744; GFX11W32-NEXT:    ; implicit-def: $vgpr0
1745; GFX11W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1746; GFX11W32-NEXT:    s_cbranch_execz .LBB7_2
1747; GFX11W32-NEXT:  ; %bb.1:
1748; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1749; GFX11W32-NEXT:    v_mov_b32_e32 v0, s6
1750; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
1751; GFX11W32-NEXT:    s_mov_b32 s5, s6
1752; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1753; GFX11W32-NEXT:    buffer_atomic_sub_u32 v0, v4, s[8:11], 0 idxen glc
1754; GFX11W32-NEXT:  .LBB7_2:
1755; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1756; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1757; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
1758; GFX11W32-NEXT:    v_mov_b32_e32 v0, v3
1759; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
1760; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1761; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1762; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1763; GFX11W32-NEXT:    global_store_b32 v4, v0, s[2:3]
1764; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1765; GFX11W32-NEXT:    s_endpgm
1766entry:
  ; %lane comes from the work-item id intrinsic and is the value subtracted.
1767  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Only the first operand varies per lane; the trailing i32 operands are all 0.
1768  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
  ; Store the value returned by the atomic so the result stays live.
1769  store i32 %old, i32 addrspace(1)* %out
1770  ret void
1771}
1772
; Atomic sub of the constant 1 where the third intrinsic argument (the index
; operand, going by the test name) is the work-item id.
; Per the checks below, every target issues a plain per-lane
; buffer_atomic_sub ... idxen with v0 as the index register; no mbcnt or
; exec-mask sequence appears in the expected output for this case.
1773define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
1774; GFX6-LABEL: sub_i32_varying_vindex:
1775; GFX6:       ; %bb.0: ; %entry
1776; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1777; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1778; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1779; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1780; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
1781; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1782; GFX6-NEXT:    s_mov_b32 s2, -1
1783; GFX6-NEXT:    s_waitcnt vmcnt(0)
1784; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1785; GFX6-NEXT:    s_endpgm
1786;
1787; GFX8-LABEL: sub_i32_varying_vindex:
1788; GFX8:       ; %bb.0: ; %entry
1789; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1790; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1791; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1792; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1793; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc
1794; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1795; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1796; GFX8-NEXT:    s_waitcnt vmcnt(0)
1797; GFX8-NEXT:    flat_store_dword v[0:1], v2
1798; GFX8-NEXT:    s_endpgm
1799;
1800; GFX9-LABEL: sub_i32_varying_vindex:
1801; GFX9:       ; %bb.0: ; %entry
1802; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1803; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1804; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1805; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1806; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
1807; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1808; GFX9-NEXT:    s_waitcnt vmcnt(0)
1809; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1810; GFX9-NEXT:    s_endpgm
1811;
1812; GFX10-LABEL: sub_i32_varying_vindex:
1813; GFX10:       ; %bb.0: ; %entry
1814; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1815; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1816; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1817; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1818; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
1819; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1820; GFX10-NEXT:    s_waitcnt vmcnt(0)
1821; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1822; GFX10-NEXT:    s_endpgm
1823;
1824; GFX11-LABEL: sub_i32_varying_vindex:
1825; GFX11:       ; %bb.0: ; %entry
1826; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
1827; GFX11-NEXT:    v_mov_b32_e32 v1, 1
1828; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1829; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1830; GFX11-NEXT:    buffer_atomic_sub_u32 v1, v0, s[4:7], 0 idxen glc
1831; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1832; GFX11-NEXT:    s_waitcnt vmcnt(0)
1833; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1834; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1835; GFX11-NEXT:    s_endpgm
1836entry:
  ; %lane comes from the work-item id intrinsic and is used as the third operand.
1837  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; The subtracted value is the constant 1; only the third operand varies per lane.
1838  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
  ; Store the value returned by the atomic so the result stays live.
1839  store i32 %old, i32 addrspace(1)* %out
1840  ret void
1841}
1842
; Atomic sub of the constant 1 where the fourth intrinsic argument (the offset
; operand, going by the test name) is the work-item id and the third is 0.
; Per the checks below, every target builds a v[0:1] register pair holding
; (0, work-item id) and issues a per-lane buffer_atomic_sub ... idxen offen;
; no mbcnt or exec-mask sequence appears in the expected output.
; GFX11 has separate W64/W32 checks: the W32 output uses v_dual_mov_b32 (plus
; an s_delay_alu) where the W64 output uses two v_mov_b32 instructions.
1843define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1844; GFX6-LABEL: sub_i32_varying_offset:
1845; GFX6:       ; %bb.0: ; %entry
1846; GFX6-NEXT:    v_mov_b32_e32 v1, v0
1847; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1848; GFX6-NEXT:    s_mov_b32 s2, 0
1849; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1850; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1851; GFX6-NEXT:    v_mov_b32_e32 v2, 1
1852; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1853; GFX6-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1854; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1855; GFX6-NEXT:    s_mov_b32 s2, -1
1856; GFX6-NEXT:    s_waitcnt vmcnt(0)
1857; GFX6-NEXT:    buffer_store_dword v2, off, s[0:3], 0
1858; GFX6-NEXT:    s_endpgm
1859;
1860; GFX8-LABEL: sub_i32_varying_offset:
1861; GFX8:       ; %bb.0: ; %entry
1862; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1863; GFX8-NEXT:    s_mov_b32 s2, 0
1864; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1865; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1866; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1867; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1868; GFX8-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1869; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1870; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1871; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1872; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1873; GFX8-NEXT:    s_waitcnt vmcnt(0)
1874; GFX8-NEXT:    flat_store_dword v[0:1], v2
1875; GFX8-NEXT:    s_endpgm
1876;
1877; GFX9-LABEL: sub_i32_varying_offset:
1878; GFX9:       ; %bb.0: ; %entry
1879; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1880; GFX9-NEXT:    s_mov_b32 s2, 0
1881; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1882; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1883; GFX9-NEXT:    v_mov_b32_e32 v2, 1
1884; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1885; GFX9-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1886; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1887; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1888; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1889; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
1890; GFX9-NEXT:    s_endpgm
1891;
1892; GFX10-LABEL: sub_i32_varying_offset:
1893; GFX10:       ; %bb.0: ; %entry
1894; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1895; GFX10-NEXT:    s_mov_b32 s2, 0
1896; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1897; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1898; GFX10-NEXT:    v_mov_b32_e32 v2, 1
1899; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1900; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1901; GFX10-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1902; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1903; GFX10-NEXT:    s_waitcnt vmcnt(0)
1904; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
1905; GFX10-NEXT:    s_endpgm
1906;
1907; GFX11W64-LABEL: sub_i32_varying_offset:
1908; GFX11W64:       ; %bb.0: ; %entry
1909; GFX11W64-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
1910; GFX11W64-NEXT:    s_mov_b32 s2, 0
1911; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
1912; GFX11W64-NEXT:    v_mov_b32_e32 v0, s2
1913; GFX11W64-NEXT:    v_mov_b32_e32 v2, 1
1914; GFX11W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1915; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1916; GFX11W64-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
1917; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
1918; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1919; GFX11W64-NEXT:    global_store_b32 v0, v2, s[0:1]
1920; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1921; GFX11W64-NEXT:    s_endpgm
1922;
1923; GFX11W32-LABEL: sub_i32_varying_offset:
1924; GFX11W32:       ; %bb.0: ; %entry
1925; GFX11W32-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
1926; GFX11W32-NEXT:    s_mov_b32 s2, 0
1927; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1928; GFX11W32-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s2
1929; GFX11W32-NEXT:    v_mov_b32_e32 v2, 1
1930; GFX11W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1931; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1932; GFX11W32-NEXT:    buffer_atomic_sub_u32 v2, v[0:1], s[4:7], 0 idxen offen glc
1933; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
1934; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1935; GFX11W32-NEXT:    global_store_b32 v0, v2, s[0:1]
1936; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1937; GFX11W32-NEXT:    s_endpgm
1938entry:
  ; %lane comes from the work-item id intrinsic and is used as the fourth operand.
1939  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; The subtracted value is the constant 1; only the fourth operand varies per lane.
1940  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
  ; Store the value returned by the atomic so the result stays live.
1941  store i32 %old, i32 addrspace(1)* %out
1942  ret void
1943}
1944