1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7
; Intrinsics exercised by the tests visible in this file. workitem.id.x is used
; below as the per-lane (%lane) value; the struct buffer atomics return an i32
; that the tests store to %out.
; NOTE(review): the test names below refer to the i32 operands at positions
; 1, 3 and 4 of the atomics as "vdata", "vindex" and "offset". The last two i32
; operands are always passed as 0 in the visible tests - confirm their meaning
; against the intrinsic definition.
8declare i32 @llvm.amdgcn.workitem.id.x()
9declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
10declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)
11
12; Show what the atomic optimization pass will do for struct buffers.
13
; Atomic add of the constant 5 with vindex and offset both 0; the value
; returned by the atomic is stored to %out.
; Expected code on every checked target: only the lane whose mbcnt result is 0
; issues a single buffer_atomic_add of popcount(exec) * 5 (s_bcnt1 followed by
; s_mul_i32), and afterwards each lane rebuilds its own result as the
; readfirstlane of the atomic's return plus 5 * its mbcnt value (v_mad_u32_u24).
14define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
15; GFX6-LABEL: add_i32_constant:
16; GFX6:       ; %bb.0: ; %entry
17; GFX6-NEXT:    s_mov_b64 s[6:7], exec
18; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
19; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
20; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
21; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
22; GFX6-NEXT:    ; implicit-def: $vgpr1
23; GFX6-NEXT:    s_and_saveexec_b64 s[2:3], vcc
24; GFX6-NEXT:    s_cbranch_execz .LBB0_2
25; GFX6-NEXT:  ; %bb.1:
26; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
27; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
28; GFX6-NEXT:    s_mul_i32 s0, s0, 5
29; GFX6-NEXT:    v_mov_b32_e32 v1, s0
30; GFX6-NEXT:    v_mov_b32_e32 v2, 0
31; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
32; GFX6-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
33; GFX6-NEXT:  .LBB0_2:
34; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
35; GFX6-NEXT:    s_waitcnt vmcnt(0)
36; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
37; GFX6-NEXT:    s_mov_b32 s7, 0xf000
38; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
39; GFX6-NEXT:    s_mov_b32 s6, -1
40; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
42; GFX6-NEXT:    s_endpgm
43;
44; GFX8-LABEL: add_i32_constant:
45; GFX8:       ; %bb.0: ; %entry
46; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
47; GFX8-NEXT:    s_mov_b64 s[6:7], exec
48; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
49; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
50; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
51; GFX8-NEXT:    ; implicit-def: $vgpr1
52; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
53; GFX8-NEXT:    s_cbranch_execz .LBB0_2
54; GFX8-NEXT:  ; %bb.1:
55; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
56; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
57; GFX8-NEXT:    s_mul_i32 s0, s0, 5
58; GFX8-NEXT:    v_mov_b32_e32 v1, s0
59; GFX8-NEXT:    v_mov_b32_e32 v2, 0
60; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX8-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
62; GFX8-NEXT:  .LBB0_2:
63; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
64; GFX8-NEXT:    s_waitcnt vmcnt(0)
65; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
66; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:    v_mov_b32_e32 v0, s2
69; GFX8-NEXT:    v_mov_b32_e32 v1, s3
70; GFX8-NEXT:    flat_store_dword v[0:1], v2
71; GFX8-NEXT:    s_endpgm
72;
73; GFX9-LABEL: add_i32_constant:
74; GFX9:       ; %bb.0: ; %entry
75; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
76; GFX9-NEXT:    s_mov_b64 s[6:7], exec
77; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
78; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
79; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
80; GFX9-NEXT:    ; implicit-def: $vgpr1
81; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
82; GFX9-NEXT:    s_cbranch_execz .LBB0_2
83; GFX9-NEXT:  ; %bb.1:
84; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
85; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
86; GFX9-NEXT:    s_mul_i32 s0, s0, 5
87; GFX9-NEXT:    v_mov_b32_e32 v1, s0
88; GFX9-NEXT:    v_mov_b32_e32 v2, 0
89; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
90; GFX9-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
91; GFX9-NEXT:  .LBB0_2:
92; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
93; GFX9-NEXT:    s_waitcnt vmcnt(0)
94; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
95; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
96; GFX9-NEXT:    v_mov_b32_e32 v1, 0
97; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
99; GFX9-NEXT:    s_endpgm
100;
101; GFX10W64-LABEL: add_i32_constant:
102; GFX10W64:       ; %bb.0: ; %entry
103; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
104; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
105; GFX10W64-NEXT:    ; implicit-def: $vgpr1
106; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
107; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
108; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
109; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
110; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
111; GFX10W64-NEXT:  ; %bb.1:
112; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
113; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
114; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
115; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
116; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
117; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
119; GFX10W64-NEXT:  .LBB0_2:
120; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
121; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
122; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
123; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
124; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
125; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
126; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
128; GFX10W64-NEXT:    s_endpgm
129;
130; GFX10W32-LABEL: add_i32_constant:
131; GFX10W32:       ; %bb.0: ; %entry
132; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
133; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
134; GFX10W32-NEXT:    ; implicit-def: $vgpr1
135; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
136; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
137; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
138; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
139; GFX10W32-NEXT:  ; %bb.1:
140; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
141; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
142; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
143; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
144; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
145; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
146; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
147; GFX10W32-NEXT:  .LBB0_2:
148; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
149; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
150; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
151; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
152; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
153; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
154; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
156; GFX10W32-NEXT:    s_endpgm
157entry:
158  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
159  store i32 %old, i32 addrspace(1)* %out
160  ret void
161}
162
; Atomic add of the kernel argument %additive, with vindex and offset both 0;
; the value returned by the atomic is stored to %out.
; Expected code on every checked target: only the lane whose mbcnt result is 0
; issues a single buffer_atomic_add whose data is the scalar fetched by
; s_load_dword (presumably %additive) multiplied by popcount(exec) via
; s_mul_i32. Each lane then computes readfirstlane + scalar * mbcnt, using
; v_mul_lo_u32 plus an add on GFX6/GFX8/GFX9 and one v_mad_u64_u32 on GFX10.
163define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
164; GFX6-LABEL: add_i32_uniform:
165; GFX6:       ; %bb.0: ; %entry
166; GFX6-NEXT:    s_mov_b64 s[2:3], exec
167; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
168; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
169; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
170; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
171; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
172; GFX6-NEXT:    ; implicit-def: $vgpr1
173; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
174; GFX6-NEXT:    s_cbranch_execz .LBB1_2
175; GFX6-NEXT:  ; %bb.1:
176; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
177; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
178; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX6-NEXT:    s_mul_i32 s0, s8, s0
180; GFX6-NEXT:    v_mov_b32_e32 v1, s0
181; GFX6-NEXT:    v_mov_b32_e32 v2, 0
182; GFX6-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
183; GFX6-NEXT:  .LBB1_2:
184; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
185; GFX6-NEXT:    s_waitcnt vmcnt(0)
186; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
187; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
189; GFX6-NEXT:    s_mov_b32 s7, 0xf000
190; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
191; GFX6-NEXT:    s_mov_b32 s6, -1
192; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
193; GFX6-NEXT:    s_endpgm
194;
195; GFX8-LABEL: add_i32_uniform:
196; GFX8:       ; %bb.0: ; %entry
197; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
198; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
199; GFX8-NEXT:    s_mov_b64 s[6:7], exec
200; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
201; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
202; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
203; GFX8-NEXT:    ; implicit-def: $vgpr1
204; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
205; GFX8-NEXT:    s_cbranch_execz .LBB1_2
206; GFX8-NEXT:  ; %bb.1:
207; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
208; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
209; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX8-NEXT:    s_mul_i32 s0, s8, s0
211; GFX8-NEXT:    v_mov_b32_e32 v1, s0
212; GFX8-NEXT:    v_mov_b32_e32 v2, 0
213; GFX8-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
214; GFX8-NEXT:  .LBB1_2:
215; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
216; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
217; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
218; GFX8-NEXT:    s_waitcnt vmcnt(0)
219; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
220; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
221; GFX8-NEXT:    v_mov_b32_e32 v0, s2
222; GFX8-NEXT:    v_mov_b32_e32 v1, s3
223; GFX8-NEXT:    flat_store_dword v[0:1], v2
224; GFX8-NEXT:    s_endpgm
225;
226; GFX9-LABEL: add_i32_uniform:
227; GFX9:       ; %bb.0: ; %entry
228; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
229; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
230; GFX9-NEXT:    s_mov_b64 s[6:7], exec
231; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
232; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
233; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
234; GFX9-NEXT:    ; implicit-def: $vgpr1
235; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
236; GFX9-NEXT:    s_cbranch_execz .LBB1_2
237; GFX9-NEXT:  ; %bb.1:
238; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
239; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
240; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX9-NEXT:    s_mul_i32 s0, s8, s0
242; GFX9-NEXT:    v_mov_b32_e32 v1, s0
243; GFX9-NEXT:    v_mov_b32_e32 v2, 0
244; GFX9-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
245; GFX9-NEXT:  .LBB1_2:
246; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
247; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
249; GFX9-NEXT:    s_waitcnt vmcnt(0)
250; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
251; GFX9-NEXT:    v_mov_b32_e32 v1, 0
252; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
253; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
254; GFX9-NEXT:    s_endpgm
255;
256; GFX10W64-LABEL: add_i32_uniform:
257; GFX10W64:       ; %bb.0: ; %entry
258; GFX10W64-NEXT:    s_clause 0x1
259; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
260; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
261; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
262; GFX10W64-NEXT:    ; implicit-def: $vgpr1
263; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
264; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
265; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
266; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
267; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
268; GFX10W64-NEXT:  ; %bb.1:
269; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
270; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
271; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
272; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
274; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
275; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
276; GFX10W64-NEXT:  .LBB1_2:
277; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
278; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
279; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
280; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
281; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
282; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], null, s8, v0, s[0:1]
283; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
284; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
285; GFX10W64-NEXT:    s_endpgm
286;
287; GFX10W32-LABEL: add_i32_uniform:
288; GFX10W32:       ; %bb.0: ; %entry
289; GFX10W32-NEXT:    s_clause 0x1
290; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
291; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
292; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
293; GFX10W32-NEXT:    ; implicit-def: $vgpr1
294; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
295; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
296; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
297; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
298; GFX10W32-NEXT:  ; %bb.1:
299; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
300; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
301; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
302; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
303; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
304; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
305; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
306; GFX10W32-NEXT:  .LBB1_2:
307; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
308; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
309; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
310; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
311; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
312; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], null, s4, v0, s[0:1]
313; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
314; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
315; GFX10W32-NEXT:    s_endpgm
316entry:
317  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
318  store i32 %old, i32 addrspace(1)* %out
319  ret void
320}
321
; Atomic add whose data operand is the workitem id (%lane), so it differs per
; lane; vindex and offset are 0. The returned value is stored to %out.
; Expected code:
;  - The GFX6 checks show a plain buffer_atomic_add with no mbcnt/saveexec
;    sequence, i.e. no cross-lane combining on that target.
;  - The GFX8 and GFX9 checks combine the lane values with a chain of
;    v_add_u32_dpp (row_shr 1/2/4/8, then row_bcast 15/31), read the total from
;    lane 63, and let a single lane issue the atomic. Each lane then adds its
;    own shifted running sum (v_mov_b32_dpp wave_shr 1) to the readfirstlane
;    of the atomic's return.
;  - The GFX10 checks follow the same shape using v_add_nc_u32_dpp,
;    v_permlanex16_b32 and v_readlane/v_writelane fix-ups; the wave64 and
;    wave32 outputs differ and are checked under separate prefixes.
322define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
323; GFX6-LABEL: add_i32_varying_vdata:
324; GFX6:       ; %bb.0: ; %entry
325; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
326; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
327; GFX6-NEXT:    v_mov_b32_e32 v1, 0
328; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
329; GFX6-NEXT:    buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
330; GFX6-NEXT:    s_mov_b32 s3, 0xf000
331; GFX6-NEXT:    s_mov_b32 s2, -1
332; GFX6-NEXT:    s_waitcnt vmcnt(0)
333; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
334; GFX6-NEXT:    s_endpgm
335;
336; GFX8-LABEL: add_i32_varying_vdata:
337; GFX8:       ; %bb.0: ; %entry
338; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
339; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
340; GFX8-NEXT:    v_mov_b32_e32 v1, 0
341; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
342; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
343; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
344; GFX8-NEXT:    v_mov_b32_e32 v2, v0
345; GFX8-NEXT:    s_not_b64 exec, exec
346; GFX8-NEXT:    v_mov_b32_e32 v2, 0
347; GFX8-NEXT:    s_not_b64 exec, exec
348; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
349; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
350; GFX8-NEXT:    s_nop 1
351; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
352; GFX8-NEXT:    s_nop 1
353; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
354; GFX8-NEXT:    s_nop 1
355; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
356; GFX8-NEXT:    s_nop 1
357; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
358; GFX8-NEXT:    s_nop 1
359; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
360; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
361; GFX8-NEXT:    s_nop 0
362; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
363; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
364; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
365; GFX8-NEXT:    ; implicit-def: $vgpr0
366; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
367; GFX8-NEXT:    s_cbranch_execz .LBB2_2
368; GFX8-NEXT:  ; %bb.1:
369; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
370; GFX8-NEXT:    v_mov_b32_e32 v0, s6
371; GFX8-NEXT:    v_mov_b32_e32 v3, 0
372; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
373; GFX8-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
374; GFX8-NEXT:  .LBB2_2:
375; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
376; GFX8-NEXT:    s_waitcnt vmcnt(0)
377; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
378; GFX8-NEXT:    v_mov_b32_e32 v0, v1
379; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
380; GFX8-NEXT:    v_mov_b32_e32 v4, s3
381; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
382; GFX8-NEXT:    v_mov_b32_e32 v3, s2
383; GFX8-NEXT:    flat_store_dword v[3:4], v0
384; GFX8-NEXT:    s_endpgm
385;
386; GFX9-LABEL: add_i32_varying_vdata:
387; GFX9:       ; %bb.0: ; %entry
388; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
389; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
390; GFX9-NEXT:    v_mov_b32_e32 v1, 0
391; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
392; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
393; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
394; GFX9-NEXT:    v_mov_b32_e32 v2, v0
395; GFX9-NEXT:    s_not_b64 exec, exec
396; GFX9-NEXT:    v_mov_b32_e32 v2, 0
397; GFX9-NEXT:    s_not_b64 exec, exec
398; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
399; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
400; GFX9-NEXT:    s_nop 1
401; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
402; GFX9-NEXT:    s_nop 1
403; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
404; GFX9-NEXT:    s_nop 1
405; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
406; GFX9-NEXT:    s_nop 1
407; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
408; GFX9-NEXT:    s_nop 1
409; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
410; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
411; GFX9-NEXT:    s_nop 0
412; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
413; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
414; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
415; GFX9-NEXT:    ; implicit-def: $vgpr0
416; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
417; GFX9-NEXT:    s_cbranch_execz .LBB2_2
418; GFX9-NEXT:  ; %bb.1:
419; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
420; GFX9-NEXT:    v_mov_b32_e32 v0, s6
421; GFX9-NEXT:    v_mov_b32_e32 v3, 0
422; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
423; GFX9-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
424; GFX9-NEXT:  .LBB2_2:
425; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
426; GFX9-NEXT:    s_waitcnt vmcnt(0)
427; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
428; GFX9-NEXT:    v_mov_b32_e32 v0, v1
429; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
430; GFX9-NEXT:    v_mov_b32_e32 v3, 0
431; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
432; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
433; GFX9-NEXT:    s_endpgm
434;
435; GFX10W64-LABEL: add_i32_varying_vdata:
436; GFX10W64:       ; %bb.0: ; %entry
437; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
438; GFX10W64-NEXT:    s_not_b64 exec, exec
439; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
440; GFX10W64-NEXT:    s_not_b64 exec, exec
441; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
442; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
443; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
444; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
445; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
446; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
447; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
448; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
449; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
450; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
451; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
452; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
453; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
454; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
455; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
456; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
457; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
458; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
459; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
460; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
461; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
462; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
463; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
464; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
465; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
466; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
467; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
468; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
469; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
470; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
471; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
472; GFX10W64-NEXT:    ; implicit-def: $vgpr0
473; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
474; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
475; GFX10W64-NEXT:  ; %bb.1:
476; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
477; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
478; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
479; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
480; GFX10W64-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
481; GFX10W64-NEXT:  .LBB2_2:
482; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
483; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
484; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
485; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
486; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
487; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
488; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
489; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
491; GFX10W64-NEXT:    s_endpgm
492;
493; GFX10W32-LABEL: add_i32_varying_vdata:
494; GFX10W32:       ; %bb.0: ; %entry
495; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
496; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
497; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
498; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
499; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
500; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
501; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
502; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
503; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
504; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
505; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
506; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
507; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
508; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
509; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
510; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
511; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
512; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
513; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
514; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
515; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
516; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
517; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
518; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
519; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
520; GFX10W32-NEXT:    ; implicit-def: $vgpr0
521; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
522; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
523; GFX10W32-NEXT:  ; %bb.1:
524; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
525; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
526; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
527; GFX10W32-NEXT:    s_mov_b32 s5, s6
528; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX10W32-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
530; GFX10W32-NEXT:  .LBB2_2:
531; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
532; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
533; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
534; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
535; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
536; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
537; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
538; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
540; GFX10W32-NEXT:    s_endpgm
541entry:
542  %lane = call i32 @llvm.amdgcn.workitem.id.x()
543  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
544  store i32 %old, i32 addrspace(1)* %out
545  ret void
546}
547
; Atomic add of the constant 1 where the vindex operand is the workitem id
; (%lane); offset is 0. The returned value is stored to %out.
; Expected code on all checked targets: a direct buffer_atomic_add with idxen
; that takes v0 as its address VGPR, with no mbcnt/saveexec single-lane
; sequence around it. Both GFX10 wave sizes share one check prefix here.
548define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
549; GFX6-LABEL: add_i32_varying_vindex:
550; GFX6:       ; %bb.0: ; %entry
551; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
552; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
553; GFX6-NEXT:    v_mov_b32_e32 v1, 1
554; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
555; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
556; GFX6-NEXT:    s_mov_b32 s3, 0xf000
557; GFX6-NEXT:    s_mov_b32 s2, -1
558; GFX6-NEXT:    s_waitcnt vmcnt(0)
559; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
560; GFX6-NEXT:    s_endpgm
561;
562; GFX8-LABEL: add_i32_varying_vindex:
563; GFX8:       ; %bb.0: ; %entry
564; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
565; GFX8-NEXT:    v_mov_b32_e32 v2, 1
566; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
567; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
568; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 idxen glc
569; GFX8-NEXT:    v_mov_b32_e32 v0, s0
570; GFX8-NEXT:    v_mov_b32_e32 v1, s1
571; GFX8-NEXT:    s_waitcnt vmcnt(0)
572; GFX8-NEXT:    flat_store_dword v[0:1], v2
573; GFX8-NEXT:    s_endpgm
574;
575; GFX9-LABEL: add_i32_varying_vindex:
576; GFX9:       ; %bb.0: ; %entry
577; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
578; GFX9-NEXT:    v_mov_b32_e32 v1, 1
579; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
580; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
581; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
582; GFX9-NEXT:    v_mov_b32_e32 v0, 0
583; GFX9-NEXT:    s_waitcnt vmcnt(0)
584; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
585; GFX9-NEXT:    s_endpgm
586;
587; GFX10-LABEL: add_i32_varying_vindex:
588; GFX10:       ; %bb.0: ; %entry
589; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
590; GFX10-NEXT:    v_mov_b32_e32 v1, 1
591; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
592; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
593; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
594; GFX10-NEXT:    v_mov_b32_e32 v0, 0
595; GFX10-NEXT:    s_waitcnt vmcnt(0)
596; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
597; GFX10-NEXT:    s_endpgm
598entry:
599  %lane = call i32 @llvm.amdgcn.workitem.id.x()
600  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
601  store i32 %old, i32 addrspace(1)* %out
602  ret void
603}
604
; Atomic add of the constant 1 where the offset operand is the workitem id
; (%lane); vindex is 0. The returned value is stored to %out.
; Expected code on all checked targets: a direct buffer_atomic_add with
; idxen offen that takes the VGPR pair v[0:1] (v0 set to 0 through s2, v1 a
; copy of the incoming v0), with no mbcnt/saveexec single-lane sequence around
; it. Both GFX10 wave sizes share one check prefix here.
605define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
606; GFX6-LABEL: add_i32_varying_offset:
607; GFX6:       ; %bb.0: ; %entry
608; GFX6-NEXT:    v_mov_b32_e32 v1, v0
609; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
610; GFX6-NEXT:    s_mov_b32 s2, 0
611; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
612; GFX6-NEXT:    v_mov_b32_e32 v0, s2
613; GFX6-NEXT:    v_mov_b32_e32 v2, 1
614; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
615; GFX6-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
616; GFX6-NEXT:    s_mov_b32 s3, 0xf000
617; GFX6-NEXT:    s_mov_b32 s2, -1
618; GFX6-NEXT:    s_waitcnt vmcnt(0)
619; GFX6-NEXT:    buffer_store_dword v2, off, s[0:3], 0
620; GFX6-NEXT:    s_endpgm
621;
622; GFX8-LABEL: add_i32_varying_offset:
623; GFX8:       ; %bb.0: ; %entry
624; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
625; GFX8-NEXT:    s_mov_b32 s2, 0
626; GFX8-NEXT:    v_mov_b32_e32 v1, v0
627; GFX8-NEXT:    v_mov_b32_e32 v0, s2
628; GFX8-NEXT:    v_mov_b32_e32 v2, 1
629; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
630; GFX8-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
631; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
632; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
633; GFX8-NEXT:    v_mov_b32_e32 v0, s0
634; GFX8-NEXT:    v_mov_b32_e32 v1, s1
635; GFX8-NEXT:    s_waitcnt vmcnt(0)
636; GFX8-NEXT:    flat_store_dword v[0:1], v2
637; GFX8-NEXT:    s_endpgm
638;
639; GFX9-LABEL: add_i32_varying_offset:
640; GFX9:       ; %bb.0: ; %entry
641; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
642; GFX9-NEXT:    s_mov_b32 s2, 0
643; GFX9-NEXT:    v_mov_b32_e32 v1, v0
644; GFX9-NEXT:    v_mov_b32_e32 v0, s2
645; GFX9-NEXT:    v_mov_b32_e32 v2, 1
646; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
647; GFX9-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
648; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
649; GFX9-NEXT:    v_mov_b32_e32 v0, 0
650; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
651; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
652; GFX9-NEXT:    s_endpgm
653;
654; GFX10-LABEL: add_i32_varying_offset:
655; GFX10:       ; %bb.0: ; %entry
656; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
657; GFX10-NEXT:    s_mov_b32 s2, 0
658; GFX10-NEXT:    v_mov_b32_e32 v1, v0
659; GFX10-NEXT:    v_mov_b32_e32 v0, s2
660; GFX10-NEXT:    v_mov_b32_e32 v2, 1
661; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
662; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
663; GFX10-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
664; GFX10-NEXT:    v_mov_b32_e32 v0, 0
665; GFX10-NEXT:    s_waitcnt vmcnt(0)
666; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
667; GFX10-NEXT:    s_endpgm
668entry:
669  %lane = call i32 @llvm.amdgcn.workitem.id.x()
670  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
671  store i32 %old, i32 addrspace(1)* %out
672  ret void
673}
674
; sub_i32_constant - struct buffer atomic sub of the constant 5; every other
; i32 operand of the intrinsic is zero.
;
; The checks below expect all targets to collapse the wave's atomics into one:
;   * v_mbcnt_* computes each lane's count of lower active lanes from a copy
;     of exec, and only the lane whose count is 0 enters the guarded block;
;   * that lane issues a single buffer_atomic_sub whose data operand is
;     s_bcnt1(exec copy) * 5;
;   * after exec is restored, each lane rebuilds its own return value as
;     v_readfirstlane(atomic result) - 5 * mbcnt and stores it to %out.
; The wave32 run uses the 32-bit exec_lo forms and omits v_mbcnt_hi.
675define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
676; GFX6-LABEL: sub_i32_constant:
677; GFX6:       ; %bb.0: ; %entry
678; GFX6-NEXT:    s_mov_b64 s[6:7], exec
679; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
680; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
681; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
682; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
683; GFX6-NEXT:    ; implicit-def: $vgpr1
684; GFX6-NEXT:    s_and_saveexec_b64 s[2:3], vcc
685; GFX6-NEXT:    s_cbranch_execz .LBB5_2
686; GFX6-NEXT:  ; %bb.1:
687; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
688; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
689; GFX6-NEXT:    s_mul_i32 s0, s0, 5
690; GFX6-NEXT:    v_mov_b32_e32 v1, s0
691; GFX6-NEXT:    v_mov_b32_e32 v2, 0
692; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
694; GFX6-NEXT:  .LBB5_2:
695; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
696; GFX6-NEXT:    s_waitcnt vmcnt(0)
697; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
698; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
699; GFX6-NEXT:    s_mov_b32 s7, 0xf000
700; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
701; GFX6-NEXT:    s_mov_b32 s6, -1
702; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
703; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
704; GFX6-NEXT:    s_endpgm
705;
706; GFX8-LABEL: sub_i32_constant:
707; GFX8:       ; %bb.0: ; %entry
708; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
709; GFX8-NEXT:    s_mov_b64 s[6:7], exec
710; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
711; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
712; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
713; GFX8-NEXT:    ; implicit-def: $vgpr1
714; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
715; GFX8-NEXT:    s_cbranch_execz .LBB5_2
716; GFX8-NEXT:  ; %bb.1:
717; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
718; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
719; GFX8-NEXT:    s_mul_i32 s0, s0, 5
720; GFX8-NEXT:    v_mov_b32_e32 v1, s0
721; GFX8-NEXT:    v_mov_b32_e32 v2, 0
722; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
723; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
724; GFX8-NEXT:  .LBB5_2:
725; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
726; GFX8-NEXT:    s_waitcnt vmcnt(0)
727; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
728; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
729; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
730; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
731; GFX8-NEXT:    v_mov_b32_e32 v0, s2
732; GFX8-NEXT:    v_mov_b32_e32 v1, s3
733; GFX8-NEXT:    flat_store_dword v[0:1], v2
734; GFX8-NEXT:    s_endpgm
735;
736; GFX9-LABEL: sub_i32_constant:
737; GFX9:       ; %bb.0: ; %entry
738; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
739; GFX9-NEXT:    s_mov_b64 s[6:7], exec
740; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
741; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
742; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
743; GFX9-NEXT:    ; implicit-def: $vgpr1
744; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
745; GFX9-NEXT:    s_cbranch_execz .LBB5_2
746; GFX9-NEXT:  ; %bb.1:
747; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
748; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
749; GFX9-NEXT:    s_mul_i32 s0, s0, 5
750; GFX9-NEXT:    v_mov_b32_e32 v1, s0
751; GFX9-NEXT:    v_mov_b32_e32 v2, 0
752; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
754; GFX9-NEXT:  .LBB5_2:
755; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
756; GFX9-NEXT:    s_waitcnt vmcnt(0)
757; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
758; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
759; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
760; GFX9-NEXT:    v_mov_b32_e32 v1, 0
761; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
762; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
763; GFX9-NEXT:    s_endpgm
764;
765; GFX10W64-LABEL: sub_i32_constant:
766; GFX10W64:       ; %bb.0: ; %entry
767; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
768; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
769; GFX10W64-NEXT:    ; implicit-def: $vgpr1
770; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
771; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
772; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
773; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
774; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
775; GFX10W64-NEXT:  ; %bb.1:
776; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
777; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
778; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
779; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
780; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
781; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
782; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
783; GFX10W64-NEXT:  .LBB5_2:
784; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
785; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
786; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
787; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
788; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
789; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
790; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
791; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
792; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
793; GFX10W64-NEXT:    s_endpgm
794;
795; GFX10W32-LABEL: sub_i32_constant:
796; GFX10W32:       ; %bb.0: ; %entry
797; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
798; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
799; GFX10W32-NEXT:    ; implicit-def: $vgpr1
800; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
801; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
802; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
803; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
804; GFX10W32-NEXT:  ; %bb.1:
805; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
806; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
807; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
808; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
809; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
810; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
811; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
812; GFX10W32-NEXT:  .LBB5_2:
813; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
814; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
815; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
816; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
817; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
818; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
819; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
820; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
822; GFX10W32-NEXT:    s_endpgm
823entry:
; The data operand (first argument) is the immediate 5.
824  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
825  store i32 %old, i32 addrspace(1)* %out
826  ret void
827}
828
; sub_i32_uniform - struct buffer atomic sub whose data operand is the kernel
; argument %subitive (loaded with s_load_dword, so it lives in an SGPR); every
; other i32 operand of the intrinsic is zero.
;
; The checks expect the same single-atomic shape as the constant case, with the
; immediate replaced by the loaded scalar:
;   * only the lane whose v_mbcnt result is 0 issues buffer_atomic_sub, with
;     data = %subitive * s_bcnt1(exec copy) (s_mul_i32);
;   * each lane then computes v_readfirstlane(result) - %subitive * mbcnt using
;     v_mul_lo_u32 and a vector subtract, and stores that to %out.
829define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
830; GFX6-LABEL: sub_i32_uniform:
831; GFX6:       ; %bb.0: ; %entry
832; GFX6-NEXT:    s_mov_b64 s[2:3], exec
833; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
834; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
835; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
836; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
837; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
838; GFX6-NEXT:    ; implicit-def: $vgpr1
839; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
840; GFX6-NEXT:    s_cbranch_execz .LBB6_2
841; GFX6-NEXT:  ; %bb.1:
842; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
843; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
844; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
845; GFX6-NEXT:    s_mul_i32 s0, s8, s0
846; GFX6-NEXT:    v_mov_b32_e32 v1, s0
847; GFX6-NEXT:    v_mov_b32_e32 v2, 0
848; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
849; GFX6-NEXT:  .LBB6_2:
850; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
851; GFX6-NEXT:    s_waitcnt vmcnt(0)
852; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
853; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
855; GFX6-NEXT:    s_mov_b32 s7, 0xf000
856; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
857; GFX6-NEXT:    s_mov_b32 s6, -1
858; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
859; GFX6-NEXT:    s_endpgm
860;
861; GFX8-LABEL: sub_i32_uniform:
862; GFX8:       ; %bb.0: ; %entry
863; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
864; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
865; GFX8-NEXT:    s_mov_b64 s[6:7], exec
866; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
867; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
868; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
869; GFX8-NEXT:    ; implicit-def: $vgpr1
870; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
871; GFX8-NEXT:    s_cbranch_execz .LBB6_2
872; GFX8-NEXT:  ; %bb.1:
873; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
874; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
875; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
876; GFX8-NEXT:    s_mul_i32 s0, s8, s0
877; GFX8-NEXT:    v_mov_b32_e32 v1, s0
878; GFX8-NEXT:    v_mov_b32_e32 v2, 0
879; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
880; GFX8-NEXT:  .LBB6_2:
881; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
882; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
883; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
884; GFX8-NEXT:    s_waitcnt vmcnt(0)
885; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
886; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
887; GFX8-NEXT:    v_mov_b32_e32 v0, s2
888; GFX8-NEXT:    v_mov_b32_e32 v1, s3
889; GFX8-NEXT:    flat_store_dword v[0:1], v2
890; GFX8-NEXT:    s_endpgm
891;
892; GFX9-LABEL: sub_i32_uniform:
893; GFX9:       ; %bb.0: ; %entry
894; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
895; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
896; GFX9-NEXT:    s_mov_b64 s[6:7], exec
897; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
898; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
899; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
900; GFX9-NEXT:    ; implicit-def: $vgpr1
901; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
902; GFX9-NEXT:    s_cbranch_execz .LBB6_2
903; GFX9-NEXT:  ; %bb.1:
904; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
905; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
906; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
907; GFX9-NEXT:    s_mul_i32 s0, s8, s0
908; GFX9-NEXT:    v_mov_b32_e32 v1, s0
909; GFX9-NEXT:    v_mov_b32_e32 v2, 0
910; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
911; GFX9-NEXT:  .LBB6_2:
912; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
913; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
914; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
915; GFX9-NEXT:    s_waitcnt vmcnt(0)
916; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
917; GFX9-NEXT:    v_mov_b32_e32 v1, 0
918; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
919; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
920; GFX9-NEXT:    s_endpgm
921;
922; GFX10W64-LABEL: sub_i32_uniform:
923; GFX10W64:       ; %bb.0: ; %entry
924; GFX10W64-NEXT:    s_clause 0x1
925; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
926; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
927; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
928; GFX10W64-NEXT:    ; implicit-def: $vgpr1
929; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
930; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
931; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
932; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
933; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
934; GFX10W64-NEXT:  ; %bb.1:
935; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
936; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
937; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
938; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
939; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
940; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
941; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
942; GFX10W64-NEXT:  .LBB6_2:
943; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
944; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
945; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
947; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
948; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
949; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
950; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
951; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
952; GFX10W64-NEXT:    s_endpgm
953;
954; GFX10W32-LABEL: sub_i32_uniform:
955; GFX10W32:       ; %bb.0: ; %entry
956; GFX10W32-NEXT:    s_clause 0x1
957; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
958; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
959; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
960; GFX10W32-NEXT:    ; implicit-def: $vgpr1
961; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
962; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
963; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
964; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
965; GFX10W32-NEXT:  ; %bb.1:
966; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
967; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
968; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
969; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
970; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
971; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
972; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
973; GFX10W32-NEXT:  .LBB6_2:
974; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
975; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
976; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
977; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
978; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
979; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
980; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
981; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
982; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
983; GFX10W32-NEXT:    s_endpgm
984entry:
; The data operand (first argument) is the kernel argument %subitive.
985  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
986  store i32 %old, i32 addrspace(1)* %out
987  ret void
988}
989
; sub_i32_varying_vdata - struct buffer atomic sub whose data operand is the
; per-lane workitem id (a divergent value); every other i32 operand is zero.
;
; Expected output differs by target:
;   * On GFX6 the checks show no optimization - each lane issues its own
;     buffer_atomic_sub with v0 as data and stores the returned value directly.
;   * On GFX8 and GFX9 the checks show a cross-lane scan built from
;     v_add_u32_dpp (row_shr 1/2/4/8, then row_bcast 15 and 31), with inactive
;     lanes first forced to 0 via the s_not_b64 exec pair. The wave total is
;     read from lane 63, a single lane issues the atomic with that total, and
;     every lane subtracts the scan value shifted by one lane (wave_shr:1) from
;     v_readfirstlane(result).
;   * On GFX10 (both wave sizes) the row_bcast steps are replaced by
;     v_permlanex16_b32 plus v_readlane/v_writelane fix-ups, and the one-lane
;     shift uses row_shr:1 with v_writelane patching the row boundaries.
; NOTE(review): the shifted scan looks like an exclusive prefix sum of the
; per-lane data - confirm against the atomic optimizer pass before relying on it.
990define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
991; GFX6-LABEL: sub_i32_varying_vdata:
992; GFX6:       ; %bb.0: ; %entry
993; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
994; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
995; GFX6-NEXT:    v_mov_b32_e32 v1, 0
996; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
997; GFX6-NEXT:    buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc
998; GFX6-NEXT:    s_mov_b32 s3, 0xf000
999; GFX6-NEXT:    s_mov_b32 s2, -1
1000; GFX6-NEXT:    s_waitcnt vmcnt(0)
1001; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1002; GFX6-NEXT:    s_endpgm
1003;
1004; GFX8-LABEL: sub_i32_varying_vdata:
1005; GFX8:       ; %bb.0: ; %entry
1006; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1007; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1008; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1009; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1010; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1011; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1012; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1013; GFX8-NEXT:    s_not_b64 exec, exec
1014; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1015; GFX8-NEXT:    s_not_b64 exec, exec
1016; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1017; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1018; GFX8-NEXT:    s_nop 1
1019; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1020; GFX8-NEXT:    s_nop 1
1021; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1022; GFX8-NEXT:    s_nop 1
1023; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1024; GFX8-NEXT:    s_nop 1
1025; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1026; GFX8-NEXT:    s_nop 1
1027; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1028; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
1029; GFX8-NEXT:    s_nop 0
1030; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1031; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1032; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1033; GFX8-NEXT:    ; implicit-def: $vgpr0
1034; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1035; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1036; GFX8-NEXT:  ; %bb.1:
1037; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1038; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1039; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1040; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1041; GFX8-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
1042; GFX8-NEXT:  .LBB7_2:
1043; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1044; GFX8-NEXT:    s_waitcnt vmcnt(0)
1045; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1046; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1047; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1048; GFX8-NEXT:    v_mov_b32_e32 v4, s3
1049; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1050; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1051; GFX8-NEXT:    flat_store_dword v[3:4], v0
1052; GFX8-NEXT:    s_endpgm
1053;
1054; GFX9-LABEL: sub_i32_varying_vdata:
1055; GFX9:       ; %bb.0: ; %entry
1056; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1057; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1058; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1059; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1060; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1061; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1062; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1063; GFX9-NEXT:    s_not_b64 exec, exec
1064; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1065; GFX9-NEXT:    s_not_b64 exec, exec
1066; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1067; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1068; GFX9-NEXT:    s_nop 1
1069; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1070; GFX9-NEXT:    s_nop 1
1071; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1072; GFX9-NEXT:    s_nop 1
1073; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1074; GFX9-NEXT:    s_nop 1
1075; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1076; GFX9-NEXT:    s_nop 1
1077; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1078; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
1079; GFX9-NEXT:    s_nop 0
1080; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1081; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1082; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1083; GFX9-NEXT:    ; implicit-def: $vgpr0
1084; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1085; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1086; GFX9-NEXT:  ; %bb.1:
1087; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1088; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1089; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1090; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1091; GFX9-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
1092; GFX9-NEXT:  .LBB7_2:
1093; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1094; GFX9-NEXT:    s_waitcnt vmcnt(0)
1095; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1096; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1097; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1098; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1099; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1100; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
1101; GFX9-NEXT:    s_endpgm
1102;
1103; GFX10W64-LABEL: sub_i32_varying_vdata:
1104; GFX10W64:       ; %bb.0: ; %entry
1105; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
1106; GFX10W64-NEXT:    s_not_b64 exec, exec
1107; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1108; GFX10W64-NEXT:    s_not_b64 exec, exec
1109; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1110; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1111; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
1112; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1113; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1114; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1115; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
1116; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1117; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1118; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
1119; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
1120; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1121; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
1122; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1123; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
1124; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1125; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1126; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
1127; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
1128; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1129; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1130; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1131; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
1132; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
1133; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
1134; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1135; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1136; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1137; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
1138; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1139; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1140; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1141; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1142; GFX10W64-NEXT:    s_cbranch_execz .LBB7_2
1143; GFX10W64-NEXT:  ; %bb.1:
1144; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1145; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
1146; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1147; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1148; GFX10W64-NEXT:    buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
1149; GFX10W64-NEXT:  .LBB7_2:
1150; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1151; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1152; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1153; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
1154; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
1155; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1156; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1157; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1158; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
1159; GFX10W64-NEXT:    s_endpgm
1160;
1161; GFX10W32-LABEL: sub_i32_varying_vdata:
1162; GFX10W32:       ; %bb.0: ; %entry
1163; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
1164; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1165; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1166; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1167; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
1168; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1169; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1170; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1171; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1172; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
1173; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1174; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
1175; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1176; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1177; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1178; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
1179; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
1180; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1181; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
1182; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1183; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1184; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1185; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
1186; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1187; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1188; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1189; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1190; GFX10W32-NEXT:    s_cbranch_execz .LBB7_2
1191; GFX10W32-NEXT:  ; %bb.1:
1192; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1193; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
1194; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1195; GFX10W32-NEXT:    s_mov_b32 s5, s6
1196; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1197; GFX10W32-NEXT:    buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
1198; GFX10W32-NEXT:  .LBB7_2:
1199; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1200; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1201; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1202; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1203; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1204; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1205; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1206; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1207; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1208; GFX10W32-NEXT:    s_endpgm
1209entry:
; %lane differs per work item and is passed as the data operand (first argument).
1210  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1211  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
1212  store i32 %old, i32 addrspace(1)* %out
1213  ret void
1214}
1215
; sub_i32_varying_vindex - struct buffer atomic sub of the constant 1 where the
; index operand (third argument) is the per-lane workitem id.
;
; The checks show no wave-level combining on any target: there is no v_mbcnt,
; no exec masking and no v_readfirstlane. Each lane issues its own
; buffer_atomic_sub ... idxen with v0 (the workitem id) as the index register,
; and the returned value is stored to %out unchanged.
; Both GFX10 RUN lines share the common GFX10 prefix here because the expected
; output is the same for wave32 and wave64.
1216define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
1217; GFX6-LABEL: sub_i32_varying_vindex:
1218; GFX6:       ; %bb.0: ; %entry
1219; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1220; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1221; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1222; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1223; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
1224; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1225; GFX6-NEXT:    s_mov_b32 s2, -1
1226; GFX6-NEXT:    s_waitcnt vmcnt(0)
1227; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1228; GFX6-NEXT:    s_endpgm
1229;
1230; GFX8-LABEL: sub_i32_varying_vindex:
1231; GFX8:       ; %bb.0: ; %entry
1232; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1233; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1234; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1235; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc
1237; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1238; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1239; GFX8-NEXT:    s_waitcnt vmcnt(0)
1240; GFX8-NEXT:    flat_store_dword v[0:1], v2
1241; GFX8-NEXT:    s_endpgm
1242;
1243; GFX9-LABEL: sub_i32_varying_vindex:
1244; GFX9:       ; %bb.0: ; %entry
1245; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1246; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1247; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1248; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1249; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
1250; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1251; GFX9-NEXT:    s_waitcnt vmcnt(0)
1252; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1253; GFX9-NEXT:    s_endpgm
1254;
1255; GFX10-LABEL: sub_i32_varying_vindex:
1256; GFX10:       ; %bb.0: ; %entry
1257; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1258; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1259; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1260; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1261; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
1262; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1263; GFX10-NEXT:    s_waitcnt vmcnt(0)
1264; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1265; GFX10-NEXT:    s_endpgm
1266entry:
; %lane differs per work item and is passed as the index operand (third argument).
1267  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1268  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
1269  store i32 %old, i32 addrspace(1)* %out
1270  ret void
1271}
1272
; sub_i32_varying_offset - struct buffer atomic sub of the constant 1 where the
; offset operand (fourth argument) is the per-lane workitem id and the index
; operand is zero.
;
; The checks show no wave-level combining on any target: there is no v_mbcnt,
; no exec masking and no v_readfirstlane. Each lane issues its own
; buffer_atomic_sub ... idxen offen, addressed through the register pair
; v[0:1] where v0 is written with 0 and v1 holds the workitem id. The returned
; value is stored to %out unchanged.
; Both GFX10 RUN lines share the common GFX10 prefix here because the expected
; output is the same for wave32 and wave64.
1273define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1274; GFX6-LABEL: sub_i32_varying_offset:
1275; GFX6:       ; %bb.0: ; %entry
1276; GFX6-NEXT:    v_mov_b32_e32 v1, v0
1277; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1278; GFX6-NEXT:    s_mov_b32 s2, 0
1279; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1280; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1281; GFX6-NEXT:    v_mov_b32_e32 v2, 1
1282; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1283; GFX6-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1284; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1285; GFX6-NEXT:    s_mov_b32 s2, -1
1286; GFX6-NEXT:    s_waitcnt vmcnt(0)
1287; GFX6-NEXT:    buffer_store_dword v2, off, s[0:3], 0
1288; GFX6-NEXT:    s_endpgm
1289;
1290; GFX8-LABEL: sub_i32_varying_offset:
1291; GFX8:       ; %bb.0: ; %entry
1292; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1293; GFX8-NEXT:    s_mov_b32 s2, 0
1294; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1295; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1296; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1297; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1298; GFX8-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1299; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1300; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1301; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1302; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1303; GFX8-NEXT:    s_waitcnt vmcnt(0)
1304; GFX8-NEXT:    flat_store_dword v[0:1], v2
1305; GFX8-NEXT:    s_endpgm
1306;
1307; GFX9-LABEL: sub_i32_varying_offset:
1308; GFX9:       ; %bb.0: ; %entry
1309; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1310; GFX9-NEXT:    s_mov_b32 s2, 0
1311; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1312; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1313; GFX9-NEXT:    v_mov_b32_e32 v2, 1
1314; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1315; GFX9-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1316; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1317; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1318; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1319; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
1320; GFX9-NEXT:    s_endpgm
1321;
1322; GFX10-LABEL: sub_i32_varying_offset:
1323; GFX10:       ; %bb.0: ; %entry
1324; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1325; GFX10-NEXT:    s_mov_b32 s2, 0
1326; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1327; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1328; GFX10-NEXT:    v_mov_b32_e32 v2, 1
1329; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1330; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX10-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1332; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1333; GFX10-NEXT:    s_waitcnt vmcnt(0)
1334; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
1335; GFX10-NEXT:    s_endpgm
1336entry:
; %lane differs per work item and is passed as the offset operand (fourth argument).
1337  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1338  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
1339  store i32 %old, i32 addrspace(1)* %out
1340  ret void
1341}
1342