1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7
; Intrinsics exercised by the kernels in this file. As used by those kernels,
; operand 0 of the atomic intrinsics is the value to add/subtract and operand 1
; is the <4 x i32> buffer argument; the *_varying_vindex and *_varying_offset
; kernels pass the workitem id as operand 2 and operand 3 respectively.
; NOTE(review): the meaning of the last two i32 operands is not shown in this
; file (every call passes 0) - confirm against the AMDGPU intrinsic definitions.
8declare i32 @llvm.amdgcn.workitem.id.x()
9declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32)
10declare i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32, i32)
11
12; Show what the atomic optimization pass will do for struct buffers.
13
; add_i32_constant - every lane adds the constant 5 through
; llvm.amdgcn.struct.buffer.atomic.add (all remaining i32 operands are 0) and
; stores the returned value to %out.
; Expected output, for every run line: only the lane whose mbcnt result is 0
; executes buffer_atomic_add, and the value it adds is s_bcnt1 of the saved
; exec mask multiplied by 5 (s_mul_i32). After the branch rejoins,
; v_readfirstlane of the atomic result is combined with (mbcnt result * 5) by
; v_mad_u32_u24 to form the value that is stored.
; The wave32 run line uses only v_mbcnt_lo and s_bcnt1_i32_b32.
14define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
15; GFX6-LABEL: add_i32_constant:
16; GFX6:       ; %bb.0: ; %entry
17; GFX6-NEXT:    s_mov_b64 s[6:7], exec
18; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
19; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
20; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
21; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
22; GFX6-NEXT:    ; implicit-def: $vgpr1
23; GFX6-NEXT:    s_and_saveexec_b64 s[2:3], vcc
24; GFX6-NEXT:    s_cbranch_execz .LBB0_2
25; GFX6-NEXT:  ; %bb.1:
26; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
27; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
28; GFX6-NEXT:    s_mul_i32 s0, s0, 5
29; GFX6-NEXT:    v_mov_b32_e32 v1, s0
30; GFX6-NEXT:    v_mov_b32_e32 v2, 0
31; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
32; GFX6-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
33; GFX6-NEXT:  .LBB0_2:
34; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
35; GFX6-NEXT:    s_waitcnt vmcnt(0)
36; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
37; GFX6-NEXT:    s_mov_b32 s7, 0xf000
38; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
39; GFX6-NEXT:    s_mov_b32 s6, -1
40; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
42; GFX6-NEXT:    s_endpgm
43;
44; GFX8-LABEL: add_i32_constant:
45; GFX8:       ; %bb.0: ; %entry
46; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
47; GFX8-NEXT:    s_mov_b64 s[6:7], exec
48; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
49; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
50; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
51; GFX8-NEXT:    ; implicit-def: $vgpr1
52; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
53; GFX8-NEXT:    s_cbranch_execz .LBB0_2
54; GFX8-NEXT:  ; %bb.1:
55; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
56; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
57; GFX8-NEXT:    s_mul_i32 s0, s0, 5
58; GFX8-NEXT:    v_mov_b32_e32 v1, s0
59; GFX8-NEXT:    v_mov_b32_e32 v2, 0
60; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX8-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
62; GFX8-NEXT:  .LBB0_2:
63; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
64; GFX8-NEXT:    s_waitcnt vmcnt(0)
65; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
66; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:    v_mov_b32_e32 v0, s2
69; GFX8-NEXT:    v_mov_b32_e32 v1, s3
70; GFX8-NEXT:    flat_store_dword v[0:1], v2
71; GFX8-NEXT:    s_endpgm
72;
73; GFX9-LABEL: add_i32_constant:
74; GFX9:       ; %bb.0: ; %entry
75; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
76; GFX9-NEXT:    s_mov_b64 s[6:7], exec
77; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
78; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
79; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
80; GFX9-NEXT:    ; implicit-def: $vgpr1
81; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
82; GFX9-NEXT:    s_cbranch_execz .LBB0_2
83; GFX9-NEXT:  ; %bb.1:
84; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
85; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
86; GFX9-NEXT:    s_mul_i32 s0, s0, 5
87; GFX9-NEXT:    v_mov_b32_e32 v1, s0
88; GFX9-NEXT:    v_mov_b32_e32 v2, 0
89; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
90; GFX9-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
91; GFX9-NEXT:  .LBB0_2:
92; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
93; GFX9-NEXT:    s_waitcnt vmcnt(0)
94; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
95; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
96; GFX9-NEXT:    v_mov_b32_e32 v1, 0
97; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
99; GFX9-NEXT:    s_endpgm
100;
101; GFX10W64-LABEL: add_i32_constant:
102; GFX10W64:       ; %bb.0: ; %entry
103; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
104; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
105; GFX10W64-NEXT:    ; implicit-def: $vgpr1
106; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
107; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
108; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
109; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
110; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
111; GFX10W64-NEXT:  ; %bb.1:
112; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
113; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
114; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
115; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
116; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
117; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
118; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
119; GFX10W64-NEXT:  .LBB0_2:
120; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
121; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
122; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
123; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
124; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
125; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
126; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
128; GFX10W64-NEXT:    s_endpgm
129;
130; GFX10W32-LABEL: add_i32_constant:
131; GFX10W32:       ; %bb.0: ; %entry
132; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
133; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
134; GFX10W32-NEXT:    ; implicit-def: $vgpr1
135; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
136; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
137; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
138; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
139; GFX10W32-NEXT:  ; %bb.1:
140; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
141; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
142; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
143; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
144; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
145; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
146; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
147; GFX10W32-NEXT:  .LBB0_2:
148; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
149; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
150; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
151; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
152; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
153; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
154; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
156; GFX10W32-NEXT:    s_endpgm
157entry:
158  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
159  store i32 %old, i32 addrspace(1)* %out
160  ret void
161}
162
; add_i32_uniform - every lane adds the kernel argument %additive through
; llvm.amdgcn.struct.buffer.atomic.add (all remaining i32 operands are 0) and
; stores the returned value to %out.
; Expected output, for every run line: only the lane whose mbcnt result is 0
; executes buffer_atomic_add, adding the loaded argument multiplied by s_bcnt1
; of the saved exec mask (s_mul_i32). After the branch rejoins, each lane
; multiplies the loaded argument by its mbcnt result (v_mul_lo_u32) and adds
; the v_readfirstlane of the atomic result before storing.
163define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
164; GFX6-LABEL: add_i32_uniform:
165; GFX6:       ; %bb.0: ; %entry
166; GFX6-NEXT:    s_mov_b64 s[2:3], exec
167; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
168; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
169; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
170; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
171; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
172; GFX6-NEXT:    ; implicit-def: $vgpr1
173; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
174; GFX6-NEXT:    s_cbranch_execz .LBB1_2
175; GFX6-NEXT:  ; %bb.1:
176; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
177; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
178; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
179; GFX6-NEXT:    s_mul_i32 s0, s8, s0
180; GFX6-NEXT:    v_mov_b32_e32 v1, s0
181; GFX6-NEXT:    v_mov_b32_e32 v2, 0
182; GFX6-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
183; GFX6-NEXT:  .LBB1_2:
184; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
185; GFX6-NEXT:    s_waitcnt vmcnt(0)
186; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
187; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
188; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
189; GFX6-NEXT:    s_mov_b32 s7, 0xf000
190; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
191; GFX6-NEXT:    s_mov_b32 s6, -1
192; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
193; GFX6-NEXT:    s_endpgm
194;
195; GFX8-LABEL: add_i32_uniform:
196; GFX8:       ; %bb.0: ; %entry
197; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
198; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
199; GFX8-NEXT:    s_mov_b64 s[6:7], exec
200; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
201; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
202; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
203; GFX8-NEXT:    ; implicit-def: $vgpr1
204; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
205; GFX8-NEXT:    s_cbranch_execz .LBB1_2
206; GFX8-NEXT:  ; %bb.1:
207; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
208; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
209; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX8-NEXT:    s_mul_i32 s0, s8, s0
211; GFX8-NEXT:    v_mov_b32_e32 v1, s0
212; GFX8-NEXT:    v_mov_b32_e32 v2, 0
213; GFX8-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
214; GFX8-NEXT:  .LBB1_2:
215; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
216; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
217; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
218; GFX8-NEXT:    s_waitcnt vmcnt(0)
219; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
220; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
221; GFX8-NEXT:    v_mov_b32_e32 v0, s2
222; GFX8-NEXT:    v_mov_b32_e32 v1, s3
223; GFX8-NEXT:    flat_store_dword v[0:1], v2
224; GFX8-NEXT:    s_endpgm
225;
226; GFX9-LABEL: add_i32_uniform:
227; GFX9:       ; %bb.0: ; %entry
228; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
229; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
230; GFX9-NEXT:    s_mov_b64 s[6:7], exec
231; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
232; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
233; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
234; GFX9-NEXT:    ; implicit-def: $vgpr1
235; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
236; GFX9-NEXT:    s_cbranch_execz .LBB1_2
237; GFX9-NEXT:  ; %bb.1:
238; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
239; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
240; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX9-NEXT:    s_mul_i32 s0, s8, s0
242; GFX9-NEXT:    v_mov_b32_e32 v1, s0
243; GFX9-NEXT:    v_mov_b32_e32 v2, 0
244; GFX9-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
245; GFX9-NEXT:  .LBB1_2:
246; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
247; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
248; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
249; GFX9-NEXT:    s_waitcnt vmcnt(0)
250; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
251; GFX9-NEXT:    v_mov_b32_e32 v1, 0
252; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
253; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
254; GFX9-NEXT:    s_endpgm
255;
256; GFX10W64-LABEL: add_i32_uniform:
257; GFX10W64:       ; %bb.0: ; %entry
258; GFX10W64-NEXT:    s_clause 0x1
259; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
260; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
261; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
262; GFX10W64-NEXT:    ; implicit-def: $vgpr1
263; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
264; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
265; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
266; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
267; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
268; GFX10W64-NEXT:  ; %bb.1:
269; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
270; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
271; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
272; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
274; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
275; GFX10W64-NEXT:    buffer_atomic_add v1, v2, s[12:15], 0 idxen glc
276; GFX10W64-NEXT:  .LBB1_2:
277; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
278; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
279; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
280; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
281; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
282; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
283; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
284; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
285; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
286; GFX10W64-NEXT:    s_endpgm
287;
288; GFX10W32-LABEL: add_i32_uniform:
289; GFX10W32:       ; %bb.0: ; %entry
290; GFX10W32-NEXT:    s_clause 0x1
291; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
292; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
293; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
294; GFX10W32-NEXT:    ; implicit-def: $vgpr1
295; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
296; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
297; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
298; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
299; GFX10W32-NEXT:  ; %bb.1:
300; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
301; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
302; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
303; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
305; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
306; GFX10W32-NEXT:    buffer_atomic_add v1, v2, s[8:11], 0 idxen glc
307; GFX10W32-NEXT:  .LBB1_2:
308; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
309; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
310; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
311; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
312; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
313; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
314; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
315; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
316; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
317; GFX10W32-NEXT:    s_endpgm
318entry:
319  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
320  store i32 %old, i32 addrspace(1)* %out
321  ret void
322}
323
; add_i32_varying_vdata - the value added is llvm.amdgcn.workitem.id.x, so it
; differs per lane; all remaining i32 operands are 0 and the returned value is
; stored to %out.
; Expected output per run line:
;  - the default-target (GFX6) checks show a plain buffer_atomic_add on v0
;    with no mbcnt / s_and_saveexec single-lane sequence;
;  - the tonga and gfx900 checks combine lanes with v_add_u32_dpp steps
;    (row_shr 1/2/4/8, then row_bcast 15 and 31), read lane 63 with
;    v_readlane_b32 as the value for the single-lane buffer_atomic_add, and
;    add a wave_shr:1 v_mov_b32_dpp copy to the v_readfirstlane result;
;  - the gfx1010 checks use v_add_nc_u32_dpp row_shr steps plus
;    v_permlanex16_b32 and v_readlane_b32 / v_writelane_b32 instead of
;    row_bcast / wave_shr, then follow the same single-lane atomic pattern.
; NOTE(review): the DPP sequences look like a wave-wide prefix sum - confirm
; against the atomic optimizer pass before relying on that description.
324define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
325; GFX6-LABEL: add_i32_varying_vdata:
326; GFX6:       ; %bb.0: ; %entry
327; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
328; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
329; GFX6-NEXT:    v_mov_b32_e32 v1, 0
330; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
331; GFX6-NEXT:    buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
332; GFX6-NEXT:    s_mov_b32 s3, 0xf000
333; GFX6-NEXT:    s_mov_b32 s2, -1
334; GFX6-NEXT:    s_waitcnt vmcnt(0)
335; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
336; GFX6-NEXT:    s_endpgm
337;
338; GFX8-LABEL: add_i32_varying_vdata:
339; GFX8:       ; %bb.0: ; %entry
340; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
341; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
342; GFX8-NEXT:    v_mov_b32_e32 v1, 0
343; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
344; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
345; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
346; GFX8-NEXT:    v_mov_b32_e32 v2, v0
347; GFX8-NEXT:    s_not_b64 exec, exec
348; GFX8-NEXT:    v_mov_b32_e32 v2, 0
349; GFX8-NEXT:    s_not_b64 exec, exec
350; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
351; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
352; GFX8-NEXT:    s_nop 1
353; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
354; GFX8-NEXT:    s_nop 1
355; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
356; GFX8-NEXT:    s_nop 1
357; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
358; GFX8-NEXT:    s_nop 1
359; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
360; GFX8-NEXT:    s_nop 1
361; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
362; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
363; GFX8-NEXT:    s_nop 0
364; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
365; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
366; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
367; GFX8-NEXT:    ; implicit-def: $vgpr0
368; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
369; GFX8-NEXT:    s_cbranch_execz .LBB2_2
370; GFX8-NEXT:  ; %bb.1:
371; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
372; GFX8-NEXT:    v_mov_b32_e32 v0, s6
373; GFX8-NEXT:    v_mov_b32_e32 v3, 0
374; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
375; GFX8-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
376; GFX8-NEXT:  .LBB2_2:
377; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
378; GFX8-NEXT:    s_waitcnt vmcnt(0)
379; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
380; GFX8-NEXT:    v_mov_b32_e32 v0, v1
381; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
382; GFX8-NEXT:    v_mov_b32_e32 v4, s3
383; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
384; GFX8-NEXT:    v_mov_b32_e32 v3, s2
385; GFX8-NEXT:    flat_store_dword v[3:4], v0
386; GFX8-NEXT:    s_endpgm
387;
388; GFX9-LABEL: add_i32_varying_vdata:
389; GFX9:       ; %bb.0: ; %entry
390; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
391; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
392; GFX9-NEXT:    v_mov_b32_e32 v1, 0
393; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
394; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
395; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
396; GFX9-NEXT:    v_mov_b32_e32 v2, v0
397; GFX9-NEXT:    s_not_b64 exec, exec
398; GFX9-NEXT:    v_mov_b32_e32 v2, 0
399; GFX9-NEXT:    s_not_b64 exec, exec
400; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
401; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
402; GFX9-NEXT:    s_nop 1
403; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
404; GFX9-NEXT:    s_nop 1
405; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
406; GFX9-NEXT:    s_nop 1
407; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
408; GFX9-NEXT:    s_nop 1
409; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
410; GFX9-NEXT:    s_nop 1
411; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
412; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
413; GFX9-NEXT:    s_nop 0
414; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
415; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
416; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
417; GFX9-NEXT:    ; implicit-def: $vgpr0
418; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
419; GFX9-NEXT:    s_cbranch_execz .LBB2_2
420; GFX9-NEXT:  ; %bb.1:
421; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
422; GFX9-NEXT:    v_mov_b32_e32 v0, s6
423; GFX9-NEXT:    v_mov_b32_e32 v3, 0
424; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
425; GFX9-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
426; GFX9-NEXT:  .LBB2_2:
427; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
428; GFX9-NEXT:    s_waitcnt vmcnt(0)
429; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
430; GFX9-NEXT:    v_mov_b32_e32 v0, v1
431; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
432; GFX9-NEXT:    v_mov_b32_e32 v3, 0
433; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
434; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
435; GFX9-NEXT:    s_endpgm
436;
437; GFX10W64-LABEL: add_i32_varying_vdata:
438; GFX10W64:       ; %bb.0: ; %entry
439; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
440; GFX10W64-NEXT:    s_not_b64 exec, exec
441; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
442; GFX10W64-NEXT:    s_not_b64 exec, exec
443; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
444; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
445; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
446; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
447; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
448; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
449; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
450; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
451; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
452; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
453; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
454; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
455; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
456; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
457; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
458; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
459; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
460; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
461; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
462; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
463; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
464; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
465; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
466; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
467; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
468; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
469; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
470; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
471; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
472; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
473; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
474; GFX10W64-NEXT:    ; implicit-def: $vgpr0
475; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
476; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
477; GFX10W64-NEXT:  ; %bb.1:
478; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
479; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
480; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
481; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
482; GFX10W64-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
483; GFX10W64-NEXT:  .LBB2_2:
484; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
485; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
486; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
487; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
488; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
489; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
490; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
491; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
492; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
493; GFX10W64-NEXT:    s_endpgm
494;
495; GFX10W32-LABEL: add_i32_varying_vdata:
496; GFX10W32:       ; %bb.0: ; %entry
497; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
498; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
499; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
500; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
501; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
502; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
503; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
504; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
505; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
506; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
507; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
508; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
509; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
510; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
511; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
512; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
513; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
514; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
515; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
516; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
517; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
518; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
519; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
520; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
521; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
522; GFX10W32-NEXT:    ; implicit-def: $vgpr0
523; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
524; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
525; GFX10W32-NEXT:  ; %bb.1:
526; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
527; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
528; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
529; GFX10W32-NEXT:    s_mov_b32 s5, s6
530; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
531; GFX10W32-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
532; GFX10W32-NEXT:  .LBB2_2:
533; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
534; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
535; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
536; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
537; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
538; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
539; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
540; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
541; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
542; GFX10W32-NEXT:    s_endpgm
543entry:
544  %lane = call i32 @llvm.amdgcn.workitem.id.x()
545  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
546  store i32 %old, i32 addrspace(1)* %out
547  ret void
548}
549
; add_i32_varying_vindex - adds the constant 1, passing
; llvm.amdgcn.workitem.id.x as the third intrinsic operand (the one after the
; <4 x i32> buffer argument); the returned value is stored to %out.
; Expected output, for every run line: a direct "buffer_atomic_add ... idxen"
; that uses v0 as the index register, with no mbcnt / s_and_saveexec
; single-lane sequence. Both gfx1010 run lines are covered by the shared GFX10
; prefix here.
550define amdgpu_kernel void @add_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
551; GFX6-LABEL: add_i32_varying_vindex:
552; GFX6:       ; %bb.0: ; %entry
553; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
554; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
555; GFX6-NEXT:    v_mov_b32_e32 v1, 1
556; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
557; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
558; GFX6-NEXT:    s_mov_b32 s3, 0xf000
559; GFX6-NEXT:    s_mov_b32 s2, -1
560; GFX6-NEXT:    s_waitcnt vmcnt(0)
561; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
562; GFX6-NEXT:    s_endpgm
563;
564; GFX8-LABEL: add_i32_varying_vindex:
565; GFX8:       ; %bb.0: ; %entry
566; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
567; GFX8-NEXT:    v_mov_b32_e32 v2, 1
568; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
569; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
570; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 idxen glc
571; GFX8-NEXT:    v_mov_b32_e32 v0, s0
572; GFX8-NEXT:    v_mov_b32_e32 v1, s1
573; GFX8-NEXT:    s_waitcnt vmcnt(0)
574; GFX8-NEXT:    flat_store_dword v[0:1], v2
575; GFX8-NEXT:    s_endpgm
576;
577; GFX9-LABEL: add_i32_varying_vindex:
578; GFX9:       ; %bb.0: ; %entry
579; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
580; GFX9-NEXT:    v_mov_b32_e32 v1, 1
581; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
582; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
583; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
584; GFX9-NEXT:    v_mov_b32_e32 v0, 0
585; GFX9-NEXT:    s_waitcnt vmcnt(0)
586; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
587; GFX9-NEXT:    s_endpgm
588;
589; GFX10-LABEL: add_i32_varying_vindex:
590; GFX10:       ; %bb.0: ; %entry
591; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
592; GFX10-NEXT:    v_mov_b32_e32 v1, 1
593; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
594; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
595; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 idxen glc
596; GFX10-NEXT:    v_mov_b32_e32 v0, 0
597; GFX10-NEXT:    s_waitcnt vmcnt(0)
598; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
599; GFX10-NEXT:    s_endpgm
600entry:
601  %lane = call i32 @llvm.amdgcn.workitem.id.x()
602  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
603  store i32 %old, i32 addrspace(1)* %out
604  ret void
605}
606
; add_i32_varying_offset - adds the constant 1, passing
; llvm.amdgcn.workitem.id.x as the fourth intrinsic operand while the third
; operand stays 0; the returned value is stored to %out.
; Expected output, for every run line: a direct
; "buffer_atomic_add v2, v[0:1], ... idxen offen", where v0 is set to 0 (via
; s2) and v1 receives a copy of the incoming v0. There is no mbcnt /
; s_and_saveexec single-lane sequence. Both gfx1010 run lines are covered by
; the shared GFX10 prefix here.
607define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
608; GFX6-LABEL: add_i32_varying_offset:
609; GFX6:       ; %bb.0: ; %entry
610; GFX6-NEXT:    v_mov_b32_e32 v1, v0
611; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
612; GFX6-NEXT:    s_mov_b32 s2, 0
613; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
614; GFX6-NEXT:    v_mov_b32_e32 v0, s2
615; GFX6-NEXT:    v_mov_b32_e32 v2, 1
616; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
617; GFX6-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
618; GFX6-NEXT:    s_mov_b32 s3, 0xf000
619; GFX6-NEXT:    s_mov_b32 s2, -1
620; GFX6-NEXT:    s_waitcnt vmcnt(0)
621; GFX6-NEXT:    buffer_store_dword v2, off, s[0:3], 0
622; GFX6-NEXT:    s_endpgm
623;
624; GFX8-LABEL: add_i32_varying_offset:
625; GFX8:       ; %bb.0: ; %entry
626; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
627; GFX8-NEXT:    s_mov_b32 s2, 0
628; GFX8-NEXT:    v_mov_b32_e32 v1, v0
629; GFX8-NEXT:    v_mov_b32_e32 v0, s2
630; GFX8-NEXT:    v_mov_b32_e32 v2, 1
631; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
632; GFX8-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
633; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
634; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX8-NEXT:    v_mov_b32_e32 v0, s0
636; GFX8-NEXT:    v_mov_b32_e32 v1, s1
637; GFX8-NEXT:    s_waitcnt vmcnt(0)
638; GFX8-NEXT:    flat_store_dword v[0:1], v2
639; GFX8-NEXT:    s_endpgm
640;
641; GFX9-LABEL: add_i32_varying_offset:
642; GFX9:       ; %bb.0: ; %entry
643; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
644; GFX9-NEXT:    s_mov_b32 s2, 0
645; GFX9-NEXT:    v_mov_b32_e32 v1, v0
646; GFX9-NEXT:    v_mov_b32_e32 v0, s2
647; GFX9-NEXT:    v_mov_b32_e32 v2, 1
648; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX9-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
650; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
651; GFX9-NEXT:    v_mov_b32_e32 v0, 0
652; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
653; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
654; GFX9-NEXT:    s_endpgm
655;
656; GFX10-LABEL: add_i32_varying_offset:
657; GFX10:       ; %bb.0: ; %entry
658; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
659; GFX10-NEXT:    s_mov_b32 s2, 0
660; GFX10-NEXT:    v_mov_b32_e32 v1, v0
661; GFX10-NEXT:    v_mov_b32_e32 v0, s2
662; GFX10-NEXT:    v_mov_b32_e32 v2, 1
663; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
664; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
665; GFX10-NEXT:    buffer_atomic_add v2, v[0:1], s[4:7], 0 idxen offen glc
666; GFX10-NEXT:    v_mov_b32_e32 v0, 0
667; GFX10-NEXT:    s_waitcnt vmcnt(0)
668; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
669; GFX10-NEXT:    s_endpgm
670entry:
671  %lane = call i32 @llvm.amdgcn.workitem.id.x()
672  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
673  store i32 %old, i32 addrspace(1)* %out
674  ret void
675}
676
; Struct-buffer atomic sub whose value operand is the constant 5 and whose
; index/offset operands are all 0. The returned value is stored to %out.
; As the checks below show, every target lets only the lane whose mbcnt over
; the saved exec mask is 0 issue one buffer_atomic_sub of popcount(exec) * 5
; (s_bcnt1 followed by s_mul_i32). After the branch rejoins, each lane forms
; its own result as readfirstlane(returned value) - 5 * mbcnt.
677define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
678; GFX6-LABEL: sub_i32_constant:
679; GFX6:       ; %bb.0: ; %entry
680; GFX6-NEXT:    s_mov_b64 s[6:7], exec
681; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
682; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s6, 0
683; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s7, v0
684; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
685; GFX6-NEXT:    ; implicit-def: $vgpr1
686; GFX6-NEXT:    s_and_saveexec_b64 s[2:3], vcc
687; GFX6-NEXT:    s_cbranch_execz .LBB5_2
688; GFX6-NEXT:  ; %bb.1:
689; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
690; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
691; GFX6-NEXT:    s_mul_i32 s0, s0, 5
692; GFX6-NEXT:    v_mov_b32_e32 v1, s0
693; GFX6-NEXT:    v_mov_b32_e32 v2, 0
694; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
695; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
696; GFX6-NEXT:  .LBB5_2:
697; GFX6-NEXT:    s_or_b64 exec, exec, s[2:3]
698; GFX6-NEXT:    s_waitcnt vmcnt(0)
699; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
700; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
701; GFX6-NEXT:    s_mov_b32 s7, 0xf000
702; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
703; GFX6-NEXT:    s_mov_b32 s6, -1
704; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
706; GFX6-NEXT:    s_endpgm
707;
708; GFX8-LABEL: sub_i32_constant:
709; GFX8:       ; %bb.0: ; %entry
710; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
711; GFX8-NEXT:    s_mov_b64 s[6:7], exec
712; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
713; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
714; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
715; GFX8-NEXT:    ; implicit-def: $vgpr1
716; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
717; GFX8-NEXT:    s_cbranch_execz .LBB5_2
718; GFX8-NEXT:  ; %bb.1:
719; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
720; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
721; GFX8-NEXT:    s_mul_i32 s0, s0, 5
722; GFX8-NEXT:    v_mov_b32_e32 v1, s0
723; GFX8-NEXT:    v_mov_b32_e32 v2, 0
724; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
725; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
726; GFX8-NEXT:  .LBB5_2:
727; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
728; GFX8-NEXT:    s_waitcnt vmcnt(0)
729; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
730; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
731; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
732; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX8-NEXT:    v_mov_b32_e32 v0, s2
734; GFX8-NEXT:    v_mov_b32_e32 v1, s3
735; GFX8-NEXT:    flat_store_dword v[0:1], v2
736; GFX8-NEXT:    s_endpgm
737;
738; GFX9-LABEL: sub_i32_constant:
739; GFX9:       ; %bb.0: ; %entry
740; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
741; GFX9-NEXT:    s_mov_b64 s[6:7], exec
742; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
743; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
744; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
745; GFX9-NEXT:    ; implicit-def: $vgpr1
746; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
747; GFX9-NEXT:    s_cbranch_execz .LBB5_2
748; GFX9-NEXT:  ; %bb.1:
749; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
750; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
751; GFX9-NEXT:    s_mul_i32 s0, s0, 5
752; GFX9-NEXT:    v_mov_b32_e32 v1, s0
753; GFX9-NEXT:    v_mov_b32_e32 v2, 0
754; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
756; GFX9-NEXT:  .LBB5_2:
757; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
758; GFX9-NEXT:    s_waitcnt vmcnt(0)
759; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
760; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
761; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
762; GFX9-NEXT:    v_mov_b32_e32 v1, 0
763; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
765; GFX9-NEXT:    s_endpgm
766;
767; GFX10W64-LABEL: sub_i32_constant:
768; GFX10W64:       ; %bb.0: ; %entry
769; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
770; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
771; GFX10W64-NEXT:    ; implicit-def: $vgpr1
772; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
773; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
774; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
775; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
776; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
777; GFX10W64-NEXT:  ; %bb.1:
778; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
779; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
780; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
781; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
782; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
783; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
784; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
785; GFX10W64-NEXT:  .LBB5_2:
786; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
787; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
788; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
789; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
790; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
791; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
792; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
793; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
794; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
795; GFX10W64-NEXT:    s_endpgm
796;
797; GFX10W32-LABEL: sub_i32_constant:
798; GFX10W32:       ; %bb.0: ; %entry
799; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
800; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
801; GFX10W32-NEXT:    ; implicit-def: $vgpr1
802; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
803; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
804; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
805; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
806; GFX10W32-NEXT:  ; %bb.1:
807; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
808; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
809; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
810; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
811; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
812; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
813; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
814; GFX10W32-NEXT:  .LBB5_2:
815; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
816; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
817; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
818; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
819; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
820; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
821; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
822; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
824; GFX10W32-NEXT:    s_endpgm
825entry:
  ; Every lane subtracts the same literal 5 from the same buffer element.
826  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
827  store i32 %old, i32 addrspace(1)* %out
828  ret void
829}
830
; Struct-buffer atomic sub whose value operand is the kernel argument
; %subitive, which the checks load into a scalar register, and whose
; index/offset operands are all 0. The returned value is stored to %out.
; As the checks below show, every target lets only the lane with mbcnt == 0
; issue one buffer_atomic_sub of %subitive * popcount(exec) (s_bcnt1 followed
; by s_mul_i32). Each lane then subtracts %subitive * mbcnt (v_mul_lo_u32)
; from the readfirstlane of the returned value.
831define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
832; GFX6-LABEL: sub_i32_uniform:
833; GFX6:       ; %bb.0: ; %entry
834; GFX6-NEXT:    s_mov_b64 s[2:3], exec
835; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
836; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
837; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
838; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
839; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
840; GFX6-NEXT:    ; implicit-def: $vgpr1
841; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
842; GFX6-NEXT:    s_cbranch_execz .LBB6_2
843; GFX6-NEXT:  ; %bb.1:
844; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
845; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
846; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
847; GFX6-NEXT:    s_mul_i32 s0, s8, s0
848; GFX6-NEXT:    v_mov_b32_e32 v1, s0
849; GFX6-NEXT:    v_mov_b32_e32 v2, 0
850; GFX6-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
851; GFX6-NEXT:  .LBB6_2:
852; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
853; GFX6-NEXT:    s_waitcnt vmcnt(0)
854; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
855; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
856; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
857; GFX6-NEXT:    s_mov_b32 s7, 0xf000
858; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
859; GFX6-NEXT:    s_mov_b32 s6, -1
860; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
861; GFX6-NEXT:    s_endpgm
862;
863; GFX8-LABEL: sub_i32_uniform:
864; GFX8:       ; %bb.0: ; %entry
865; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
866; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
867; GFX8-NEXT:    s_mov_b64 s[6:7], exec
868; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
869; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
870; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
871; GFX8-NEXT:    ; implicit-def: $vgpr1
872; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
873; GFX8-NEXT:    s_cbranch_execz .LBB6_2
874; GFX8-NEXT:  ; %bb.1:
875; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
876; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
877; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX8-NEXT:    s_mul_i32 s0, s8, s0
879; GFX8-NEXT:    v_mov_b32_e32 v1, s0
880; GFX8-NEXT:    v_mov_b32_e32 v2, 0
881; GFX8-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
882; GFX8-NEXT:  .LBB6_2:
883; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
884; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
885; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
886; GFX8-NEXT:    s_waitcnt vmcnt(0)
887; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
888; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
889; GFX8-NEXT:    v_mov_b32_e32 v0, s2
890; GFX8-NEXT:    v_mov_b32_e32 v1, s3
891; GFX8-NEXT:    flat_store_dword v[0:1], v2
892; GFX8-NEXT:    s_endpgm
893;
894; GFX9-LABEL: sub_i32_uniform:
895; GFX9:       ; %bb.0: ; %entry
896; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
897; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
898; GFX9-NEXT:    s_mov_b64 s[6:7], exec
899; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
900; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
901; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
902; GFX9-NEXT:    ; implicit-def: $vgpr1
903; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
904; GFX9-NEXT:    s_cbranch_execz .LBB6_2
905; GFX9-NEXT:  ; %bb.1:
906; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
907; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
908; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
909; GFX9-NEXT:    s_mul_i32 s0, s8, s0
910; GFX9-NEXT:    v_mov_b32_e32 v1, s0
911; GFX9-NEXT:    v_mov_b32_e32 v2, 0
912; GFX9-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
913; GFX9-NEXT:  .LBB6_2:
914; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
915; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
917; GFX9-NEXT:    s_waitcnt vmcnt(0)
918; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
919; GFX9-NEXT:    v_mov_b32_e32 v1, 0
920; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
921; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
922; GFX9-NEXT:    s_endpgm
923;
924; GFX10W64-LABEL: sub_i32_uniform:
925; GFX10W64:       ; %bb.0: ; %entry
926; GFX10W64-NEXT:    s_clause 0x1
927; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
928; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
929; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
930; GFX10W64-NEXT:    ; implicit-def: $vgpr1
931; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
932; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
933; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
934; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
935; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
936; GFX10W64-NEXT:  ; %bb.1:
937; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
938; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
939; GFX10W64-NEXT:    v_mov_b32_e32 v2, 0
940; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
941; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
942; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
943; GFX10W64-NEXT:    buffer_atomic_sub v1, v2, s[12:15], 0 idxen glc
944; GFX10W64-NEXT:  .LBB6_2:
945; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
946; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
947; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
948; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
949; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
950; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
951; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
952; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
953; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
954; GFX10W64-NEXT:    s_endpgm
955;
956; GFX10W32-LABEL: sub_i32_uniform:
957; GFX10W32:       ; %bb.0: ; %entry
958; GFX10W32-NEXT:    s_clause 0x1
959; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
960; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
961; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
962; GFX10W32-NEXT:    ; implicit-def: $vgpr1
963; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
964; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
965; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
966; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
967; GFX10W32-NEXT:  ; %bb.1:
968; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
969; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
970; GFX10W32-NEXT:    v_mov_b32_e32 v2, 0
971; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
972; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
973; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
974; GFX10W32-NEXT:    buffer_atomic_sub v1, v2, s[8:11], 0 idxen glc
975; GFX10W32-NEXT:  .LBB6_2:
976; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
977; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
978; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
979; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
980; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
981; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
982; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
983; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
984; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
985; GFX10W32-NEXT:    s_endpgm
986entry:
  ; The subtracted amount comes straight from the %subitive kernel argument.
987  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
988  store i32 %old, i32 addrspace(1)* %out
989  ret void
990}
991
; Struct-buffer atomic sub whose value operand is the workitem id
; (llvm.amdgcn.workitem.id.x) and therefore differs per lane. The index/offset
; operands are all 0 and the returned value is stored to %out.
; What the checks below show
;  - The GFX6 output issues buffer_atomic_sub directly on v0, with no
;    cross-lane reduction and no single-lane branch.
;  - The GFX8, GFX9 and GFX10 outputs write 0 into the inactive lanes, run a
;    DPP add scan over the values, read the wave total from the last lane with
;    v_readlane (lane 63, or lane 31 in the wave32 run), and let only the lane
;    with mbcnt == 0 issue one buffer_atomic_sub of that total. Each lane then
;    subtracts its one-lane-shifted scan value from the readfirstlane of the
;    returned value.
992define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
993; GFX6-LABEL: sub_i32_varying_vdata:
994; GFX6:       ; %bb.0: ; %entry
995; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
996; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
997; GFX6-NEXT:    v_mov_b32_e32 v1, 0
998; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
999; GFX6-NEXT:    buffer_atomic_sub v0, v1, s[4:7], 0 idxen glc
1000; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1001; GFX6-NEXT:    s_mov_b32 s2, -1
1002; GFX6-NEXT:    s_waitcnt vmcnt(0)
1003; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1004; GFX6-NEXT:    s_endpgm
1005;
1006; GFX8-LABEL: sub_i32_varying_vdata:
1007; GFX8:       ; %bb.0: ; %entry
1008; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1009; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1010; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1011; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1012; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1013; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1014; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1015; GFX8-NEXT:    s_not_b64 exec, exec
1016; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1017; GFX8-NEXT:    s_not_b64 exec, exec
1018; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1019; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1020; GFX8-NEXT:    s_nop 1
1021; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1022; GFX8-NEXT:    s_nop 1
1023; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1024; GFX8-NEXT:    s_nop 1
1025; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1026; GFX8-NEXT:    s_nop 1
1027; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1028; GFX8-NEXT:    s_nop 1
1029; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1030; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
1031; GFX8-NEXT:    s_nop 0
1032; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1033; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1034; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1035; GFX8-NEXT:    ; implicit-def: $vgpr0
1036; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1037; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1038; GFX8-NEXT:  ; %bb.1:
1039; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1040; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1041; GFX8-NEXT:    v_mov_b32_e32 v3, 0
1042; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1043; GFX8-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
1044; GFX8-NEXT:  .LBB7_2:
1045; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1046; GFX8-NEXT:    s_waitcnt vmcnt(0)
1047; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1048; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1049; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1050; GFX8-NEXT:    v_mov_b32_e32 v4, s3
1051; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1052; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1053; GFX8-NEXT:    flat_store_dword v[3:4], v0
1054; GFX8-NEXT:    s_endpgm
1055;
1056; GFX9-LABEL: sub_i32_varying_vdata:
1057; GFX9:       ; %bb.0: ; %entry
1058; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1059; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1060; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1061; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1062; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1063; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1064; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1065; GFX9-NEXT:    s_not_b64 exec, exec
1066; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1067; GFX9-NEXT:    s_not_b64 exec, exec
1068; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1069; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1070; GFX9-NEXT:    s_nop 1
1071; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1072; GFX9-NEXT:    s_nop 1
1073; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1074; GFX9-NEXT:    s_nop 1
1075; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1076; GFX9-NEXT:    s_nop 1
1077; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1078; GFX9-NEXT:    s_nop 1
1079; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1080; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
1081; GFX9-NEXT:    s_nop 0
1082; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1083; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1084; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1085; GFX9-NEXT:    ; implicit-def: $vgpr0
1086; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1087; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1088; GFX9-NEXT:  ; %bb.1:
1089; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1090; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1091; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1092; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1093; GFX9-NEXT:    buffer_atomic_sub v0, v3, s[8:11], 0 idxen glc
1094; GFX9-NEXT:  .LBB7_2:
1095; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1096; GFX9-NEXT:    s_waitcnt vmcnt(0)
1097; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1098; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1099; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1100; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1101; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1102; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
1103; GFX9-NEXT:    s_endpgm
1104;
1105; GFX10W64-LABEL: sub_i32_varying_vdata:
1106; GFX10W64:       ; %bb.0: ; %entry
1107; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
1108; GFX10W64-NEXT:    s_not_b64 exec, exec
1109; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1110; GFX10W64-NEXT:    s_not_b64 exec, exec
1111; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1112; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1113; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
1114; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1115; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1116; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1117; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
1118; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1119; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1120; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
1121; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
1122; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1123; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
1124; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1125; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
1126; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1127; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1128; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
1129; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
1130; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1131; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1132; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1133; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
1134; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
1135; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
1136; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1137; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1138; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1139; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
1140; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1141; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1142; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1143; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1144; GFX10W64-NEXT:    s_cbranch_execz .LBB7_2
1145; GFX10W64-NEXT:  ; %bb.1:
1146; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1147; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
1148; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1149; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1150; GFX10W64-NEXT:    buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
1151; GFX10W64-NEXT:  .LBB7_2:
1152; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1153; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1154; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1155; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
1156; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
1157; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1158; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1159; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
1161; GFX10W64-NEXT:    s_endpgm
1162;
1163; GFX10W32-LABEL: sub_i32_varying_vdata:
1164; GFX10W32:       ; %bb.0: ; %entry
1165; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
1166; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1167; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1168; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1169; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
1170; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1171; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1172; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1173; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1174; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
1175; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1176; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
1177; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1178; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1179; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1180; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
1181; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
1182; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1183; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
1184; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1185; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1186; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1187; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
1188; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1189; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1190; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1191; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1192; GFX10W32-NEXT:    s_cbranch_execz .LBB7_2
1193; GFX10W32-NEXT:  ; %bb.1:
1194; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1195; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
1196; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1197; GFX10W32-NEXT:    s_mov_b32 s5, s6
1198; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1199; GFX10W32-NEXT:    buffer_atomic_sub v0, v4, s[8:11], 0 idxen glc
1200; GFX10W32-NEXT:  .LBB7_2:
1201; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1202; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1203; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1204; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1205; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1206; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1207; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1208; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1209; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1210; GFX10W32-NEXT:    s_endpgm
1211entry:
  ; %lane is the workitem id and is used as the amount each lane subtracts.
1212  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1213  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0, i32 0)
1214  store i32 %old, i32 addrspace(1)* %out
1215  ret void
1216}
1217
; Struct-buffer atomic sub of the constant 1 where the third operand (the
; vindex, going by the test name) is the workitem id and so differs per lane.
; As the checks below show, no target funnels this atomic through a single
; lane. Each output is a plain buffer_atomic_sub with idxen, indexed by v0,
; with no mbcnt or readfirstlane sequence around it. The returned value is
; stored to %out unchanged.
1218define amdgpu_kernel void @sub_i32_varying_vindex(i32 addrspace(1)* %out, <4 x i32> %inout) {
1219; GFX6-LABEL: sub_i32_varying_vindex:
1220; GFX6:       ; %bb.0: ; %entry
1221; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1222; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1223; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1224; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1225; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
1226; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1227; GFX6-NEXT:    s_mov_b32 s2, -1
1228; GFX6-NEXT:    s_waitcnt vmcnt(0)
1229; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1230; GFX6-NEXT:    s_endpgm
1231;
1232; GFX8-LABEL: sub_i32_varying_vindex:
1233; GFX8:       ; %bb.0: ; %entry
1234; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1235; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1236; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1237; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 idxen glc
1239; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1240; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1241; GFX8-NEXT:    s_waitcnt vmcnt(0)
1242; GFX8-NEXT:    flat_store_dword v[0:1], v2
1243; GFX8-NEXT:    s_endpgm
1244;
1245; GFX9-LABEL: sub_i32_varying_vindex:
1246; GFX9:       ; %bb.0: ; %entry
1247; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1248; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1249; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1250; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1251; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
1252; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1253; GFX9-NEXT:    s_waitcnt vmcnt(0)
1254; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1255; GFX9-NEXT:    s_endpgm
1256;
1257; GFX10-LABEL: sub_i32_varying_vindex:
1258; GFX10:       ; %bb.0: ; %entry
1259; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1260; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1261; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1262; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1263; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 idxen glc
1264; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1265; GFX10-NEXT:    s_waitcnt vmcnt(0)
1266; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1267; GFX10-NEXT:    s_endpgm
1268entry:
  ; %lane is the workitem id and is passed as the third intrinsic operand.
1269  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1270  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0, i32 0)
1271  store i32 %old, i32 addrspace(1)* %out
1272  ret void
1273}
1274
; Struct-buffer atomic sub of the constant 1 where the fourth operand (the
; offset, going by the test name) is the workitem id and so differs per lane.
; As the checks below show, no target funnels this atomic through a single
; lane. Each output is a plain buffer_atomic_sub with idxen and offen,
; addressed through v[0:1] where v0 holds 0 and v1 holds the workitem id,
; with no mbcnt or readfirstlane sequence around it. The returned value is
; stored to %out unchanged.
1275define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1276; GFX6-LABEL: sub_i32_varying_offset:
1277; GFX6:       ; %bb.0: ; %entry
1278; GFX6-NEXT:    v_mov_b32_e32 v1, v0
1279; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1280; GFX6-NEXT:    s_mov_b32 s2, 0
1281; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1282; GFX6-NEXT:    v_mov_b32_e32 v0, s2
1283; GFX6-NEXT:    v_mov_b32_e32 v2, 1
1284; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1285; GFX6-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1286; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1287; GFX6-NEXT:    s_mov_b32 s2, -1
1288; GFX6-NEXT:    s_waitcnt vmcnt(0)
1289; GFX6-NEXT:    buffer_store_dword v2, off, s[0:3], 0
1290; GFX6-NEXT:    s_endpgm
1291;
1292; GFX8-LABEL: sub_i32_varying_offset:
1293; GFX8:       ; %bb.0: ; %entry
1294; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1295; GFX8-NEXT:    s_mov_b32 s2, 0
1296; GFX8-NEXT:    v_mov_b32_e32 v1, v0
1297; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1298; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1299; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1300; GFX8-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1301; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1302; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1303; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1304; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1305; GFX8-NEXT:    s_waitcnt vmcnt(0)
1306; GFX8-NEXT:    flat_store_dword v[0:1], v2
1307; GFX8-NEXT:    s_endpgm
1308;
1309; GFX9-LABEL: sub_i32_varying_offset:
1310; GFX9:       ; %bb.0: ; %entry
1311; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1312; GFX9-NEXT:    s_mov_b32 s2, 0
1313; GFX9-NEXT:    v_mov_b32_e32 v1, v0
1314; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1315; GFX9-NEXT:    v_mov_b32_e32 v2, 1
1316; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1317; GFX9-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1318; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1319; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1320; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1321; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
1322; GFX9-NEXT:    s_endpgm
1323;
1324; GFX10-LABEL: sub_i32_varying_offset:
1325; GFX10:       ; %bb.0: ; %entry
1326; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1327; GFX10-NEXT:    s_mov_b32 s2, 0
1328; GFX10-NEXT:    v_mov_b32_e32 v1, v0
1329; GFX10-NEXT:    v_mov_b32_e32 v0, s2
1330; GFX10-NEXT:    v_mov_b32_e32 v2, 1
1331; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1332; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1333; GFX10-NEXT:    buffer_atomic_sub v2, v[0:1], s[4:7], 0 idxen offen glc
1334; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1335; GFX10-NEXT:    s_waitcnt vmcnt(0)
1336; GFX10-NEXT:    global_store_dword v0, v2, s[0:1]
1337; GFX10-NEXT:    s_endpgm
1338entry:
  ; %lane is the workitem id and is passed as the fourth intrinsic operand.
1339  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1340  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 0, i32 %lane, i32 0, i32 0)
1341  store i32 %old, i32 addrspace(1)* %out
1342  ret void
1343}
1344