1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7
; Intrinsics exercised by the kernels in this file.
; workitem.id.x is used below as a source of a per-lane value.
; For the raw buffer atomics, the kernels below pass the data value as the
; first operand, the <4 x i32> kernel argument %inout as the second, and an
; offset as the third.
; NOTE(review): the last two i32 operands are always 0 here; presumably they
; are the scalar offset and the cache policy - confirm against the AMDGPU
; intrinsic definitions.
8declare i32 @llvm.amdgcn.workitem.id.x()
9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
10declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)
11
12; Show what the atomic optimization pass will do for raw buffers.
13
; add_i32_constant: raw buffer atomic add of the constant 5 at offset 0; the
; call result %old is stored to %out.
;
; What the autogenerated checks below expect, for every run line:
;   * only the lane whose v_mbcnt result is 0 passes the s_and_saveexec guard
;     and issues a single buffer_atomic_add;
;   * the value added is s_bcnt1 of the saved exec mask multiplied by 5;
;   * after exec is restored, each lane computes
;     v_readfirstlane(atomic result) + mbcnt * 5 with v_mad_u32_u24 and
;     stores it.
; The wave32 run uses the 32-bit forms (exec_lo, s_bcnt1_i32_b32) and has no
; v_mbcnt_hi step.
;
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
14define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
15; GFX6-LABEL: add_i32_constant:
16; GFX6:       ; %bb.0: ; %entry
17; GFX6-NEXT:    s_mov_b64 s[2:3], exec
18; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
19; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
20; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
21; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
22; GFX6-NEXT:    ; implicit-def: $vgpr1
23; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
24; GFX6-NEXT:    s_cbranch_execz .LBB0_2
25; GFX6-NEXT:  ; %bb.1:
26; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
27; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
28; GFX6-NEXT:    s_mul_i32 s0, s0, 5
29; GFX6-NEXT:    v_mov_b32_e32 v1, s0
30; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
32; GFX6-NEXT:  .LBB0_2:
33; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
34; GFX6-NEXT:    s_waitcnt vmcnt(0)
35; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
36; GFX6-NEXT:    s_mov_b32 s7, 0xf000
37; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
38; GFX6-NEXT:    s_mov_b32 s6, -1
39; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
41; GFX6-NEXT:    s_endpgm
42;
43; GFX8-LABEL: add_i32_constant:
44; GFX8:       ; %bb.0: ; %entry
45; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
46; GFX8-NEXT:    s_mov_b64 s[6:7], exec
47; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
48; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
49; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
50; GFX8-NEXT:    ; implicit-def: $vgpr1
51; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
52; GFX8-NEXT:    s_cbranch_execz .LBB0_2
53; GFX8-NEXT:  ; %bb.1:
54; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
55; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
56; GFX8-NEXT:    s_mul_i32 s0, s0, 5
57; GFX8-NEXT:    v_mov_b32_e32 v1, s0
58; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
60; GFX8-NEXT:  .LBB0_2:
61; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
62; GFX8-NEXT:    s_waitcnt vmcnt(0)
63; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
64; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:    v_mov_b32_e32 v0, s2
67; GFX8-NEXT:    v_mov_b32_e32 v1, s3
68; GFX8-NEXT:    flat_store_dword v[0:1], v2
69; GFX8-NEXT:    s_endpgm
70;
71; GFX9-LABEL: add_i32_constant:
72; GFX9:       ; %bb.0: ; %entry
73; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
74; GFX9-NEXT:    s_mov_b64 s[6:7], exec
75; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
76; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
77; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
78; GFX9-NEXT:    ; implicit-def: $vgpr1
79; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
80; GFX9-NEXT:    s_cbranch_execz .LBB0_2
81; GFX9-NEXT:  ; %bb.1:
82; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
83; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
84; GFX9-NEXT:    s_mul_i32 s0, s0, 5
85; GFX9-NEXT:    v_mov_b32_e32 v1, s0
86; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
87; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
88; GFX9-NEXT:  .LBB0_2:
89; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
90; GFX9-NEXT:    s_waitcnt vmcnt(0)
91; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
92; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
93; GFX9-NEXT:    v_mov_b32_e32 v1, 0
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
96; GFX9-NEXT:    s_endpgm
97;
98; GFX10W64-LABEL: add_i32_constant:
99; GFX10W64:       ; %bb.0: ; %entry
100; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
101; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
102; GFX10W64-NEXT:    ; implicit-def: $vgpr1
103; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
104; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
105; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
106; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
107; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
108; GFX10W64-NEXT:  ; %bb.1:
109; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
110; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
111; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
112; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
113; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
114; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
115; GFX10W64-NEXT:  .LBB0_2:
116; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
117; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
118; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
119; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
120; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
121; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
122; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
124; GFX10W64-NEXT:    s_endpgm
125;
126; GFX10W32-LABEL: add_i32_constant:
127; GFX10W32:       ; %bb.0: ; %entry
128; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
129; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
130; GFX10W32-NEXT:    ; implicit-def: $vgpr1
131; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
132; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
133; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
134; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
135; GFX10W32-NEXT:  ; %bb.1:
136; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
137; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
138; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
139; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
140; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
141; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
142; GFX10W32-NEXT:  .LBB0_2:
143; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
144; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
145; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
146; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
147; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
148; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
149; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
151; GFX10W32-NEXT:    s_endpgm
152entry:
153  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
154  store i32 %old, i32 addrspace(1)* %out
155  ret void
156}
157
; add_i32_uniform: raw buffer atomic add whose data operand is the kernel
; argument %additive (loaded with s_load_dword in the checks); the call result
; %old is stored to %out.
;
; What the autogenerated checks below expect, for every run line:
;   * only the lane whose v_mbcnt result is 0 passes the s_and_saveexec guard
;     and issues a single buffer_atomic_add;
;   * the value added is the loaded argument multiplied (s_mul_i32) by
;     s_bcnt1 of the saved exec mask;
;   * after exec is restored, each lane computes
;     v_readfirstlane(atomic result) + argument * mbcnt (v_mul_lo_u32 followed
;     by an add) and stores it.
;
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
158define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
159; GFX6-LABEL: add_i32_uniform:
160; GFX6:       ; %bb.0: ; %entry
161; GFX6-NEXT:    s_mov_b64 s[2:3], exec
162; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
163; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
164; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
165; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
166; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
167; GFX6-NEXT:    ; implicit-def: $vgpr1
168; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
169; GFX6-NEXT:    s_cbranch_execz .LBB1_2
170; GFX6-NEXT:  ; %bb.1:
171; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
172; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
173; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
174; GFX6-NEXT:    s_mul_i32 s0, s8, s0
175; GFX6-NEXT:    v_mov_b32_e32 v1, s0
176; GFX6-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
177; GFX6-NEXT:  .LBB1_2:
178; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
179; GFX6-NEXT:    s_waitcnt vmcnt(0)
180; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
181; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
183; GFX6-NEXT:    s_mov_b32 s7, 0xf000
184; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
185; GFX6-NEXT:    s_mov_b32 s6, -1
186; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
187; GFX6-NEXT:    s_endpgm
188;
189; GFX8-LABEL: add_i32_uniform:
190; GFX8:       ; %bb.0: ; %entry
191; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
192; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
193; GFX8-NEXT:    s_mov_b64 s[4:5], exec
194; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
195; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
196; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
197; GFX8-NEXT:    ; implicit-def: $vgpr1
198; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
199; GFX8-NEXT:    s_cbranch_execz .LBB1_2
200; GFX8-NEXT:  ; %bb.1:
201; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
202; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
203; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX8-NEXT:    s_mul_i32 s0, s8, s0
205; GFX8-NEXT:    v_mov_b32_e32 v1, s0
206; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
207; GFX8-NEXT:  .LBB1_2:
208; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
209; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
211; GFX8-NEXT:    s_waitcnt vmcnt(0)
212; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
213; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
214; GFX8-NEXT:    v_mov_b32_e32 v0, s2
215; GFX8-NEXT:    v_mov_b32_e32 v1, s3
216; GFX8-NEXT:    flat_store_dword v[0:1], v2
217; GFX8-NEXT:    s_endpgm
218;
219; GFX9-LABEL: add_i32_uniform:
220; GFX9:       ; %bb.0: ; %entry
221; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
222; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
223; GFX9-NEXT:    s_mov_b64 s[4:5], exec
224; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
225; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
226; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
227; GFX9-NEXT:    ; implicit-def: $vgpr1
228; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
229; GFX9-NEXT:    s_cbranch_execz .LBB1_2
230; GFX9-NEXT:  ; %bb.1:
231; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
232; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
233; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
234; GFX9-NEXT:    s_mul_i32 s0, s8, s0
235; GFX9-NEXT:    v_mov_b32_e32 v1, s0
236; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
237; GFX9-NEXT:  .LBB1_2:
238; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
239; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
240; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
241; GFX9-NEXT:    s_waitcnt vmcnt(0)
242; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
243; GFX9-NEXT:    v_mov_b32_e32 v1, 0
244; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
245; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
246; GFX9-NEXT:    s_endpgm
247;
248; GFX10W64-LABEL: add_i32_uniform:
249; GFX10W64:       ; %bb.0: ; %entry
250; GFX10W64-NEXT:    s_clause 0x1
251; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
252; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
253; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
254; GFX10W64-NEXT:    ; implicit-def: $vgpr1
255; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
256; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
257; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
258; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
259; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
260; GFX10W64-NEXT:  ; %bb.1:
261; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
262; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
263; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
265; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
266; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
267; GFX10W64-NEXT:  .LBB1_2:
268; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
269; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
270; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
271; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
272; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
273; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
274; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
275; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
276; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
277; GFX10W64-NEXT:    s_endpgm
278;
279; GFX10W32-LABEL: add_i32_uniform:
280; GFX10W32:       ; %bb.0: ; %entry
281; GFX10W32-NEXT:    s_clause 0x1
282; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
283; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
284; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
285; GFX10W32-NEXT:    ; implicit-def: $vgpr1
286; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
287; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
288; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
289; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
290; GFX10W32-NEXT:  ; %bb.1:
291; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
292; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
293; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
295; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
296; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
297; GFX10W32-NEXT:  .LBB1_2:
298; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
299; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
300; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
301; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
302; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
303; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
304; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
305; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
306; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
307; GFX10W32-NEXT:    s_endpgm
308entry:
309  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
310  store i32 %old, i32 addrspace(1)* %out
311  ret void
312}
313
; add_i32_varying_vdata: raw buffer atomic add whose data operand is the
; result of llvm.amdgcn.workitem.id.x (%lane), i.e. a per-lane value; the call
; result %old is stored to %out.
;
; What the autogenerated checks below expect differs per run line:
;   * The GFX6 checks show a plain buffer_atomic_add on v0 with no exec
;     masking and no cross-lane reduction.
;   * The GFX8 and GFX9 checks zero the inactive lanes (s_not_b64 exec /
;     v_mov 0 / s_not_b64 exec), then run a chain of v_add_u32_dpp steps
;     (row_shr 1, 2, 4, 8, then row_bcast 15 and 31) with exec forced to -1.
;     The total is read with v_readlane from lane 63 and a wave_shr:1
;     v_mov_b32_dpp produces the shifted per-lane partial sums. One lane
;     (mbcnt == 0) then issues the atomic with the total, and every lane adds
;     its shifted partial sum to v_readfirstlane of the atomic result.
;   * The two GFX10 runs use row_shr DPP adds plus v_permlanex16_b32 and
;     v_readlane / v_writelane fix-ups instead of row_bcast / wave_shr. The
;     atomic operand is read from lane 63 in the wave64 run and from lane 31
;     in the wave32 run.
;
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
314define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
315; GFX6-LABEL: add_i32_varying_vdata:
316; GFX6:       ; %bb.0: ; %entry
317; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
318; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
319; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
320; GFX6-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
321; GFX6-NEXT:    s_mov_b32 s3, 0xf000
322; GFX6-NEXT:    s_mov_b32 s2, -1
323; GFX6-NEXT:    s_waitcnt vmcnt(0)
324; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
325; GFX6-NEXT:    s_endpgm
326;
327; GFX8-LABEL: add_i32_varying_vdata:
328; GFX8:       ; %bb.0: ; %entry
329; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
330; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
331; GFX8-NEXT:    v_mov_b32_e32 v1, 0
332; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
333; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
334; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
335; GFX8-NEXT:    v_mov_b32_e32 v2, v0
336; GFX8-NEXT:    s_not_b64 exec, exec
337; GFX8-NEXT:    v_mov_b32_e32 v2, 0
338; GFX8-NEXT:    s_not_b64 exec, exec
339; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
340; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
341; GFX8-NEXT:    s_nop 1
342; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
343; GFX8-NEXT:    s_nop 1
344; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
345; GFX8-NEXT:    s_nop 1
346; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
347; GFX8-NEXT:    s_nop 1
348; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
349; GFX8-NEXT:    s_nop 1
350; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
351; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
352; GFX8-NEXT:    s_nop 0
353; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
354; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
355; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
356; GFX8-NEXT:    ; implicit-def: $vgpr0
357; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
358; GFX8-NEXT:    s_cbranch_execz .LBB2_2
359; GFX8-NEXT:  ; %bb.1:
360; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
361; GFX8-NEXT:    v_mov_b32_e32 v0, s6
362; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
363; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
364; GFX8-NEXT:  .LBB2_2:
365; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
366; GFX8-NEXT:    s_waitcnt vmcnt(0)
367; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
368; GFX8-NEXT:    v_mov_b32_e32 v0, v1
369; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
370; GFX8-NEXT:    v_mov_b32_e32 v4, s3
371; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
372; GFX8-NEXT:    v_mov_b32_e32 v3, s2
373; GFX8-NEXT:    flat_store_dword v[3:4], v0
374; GFX8-NEXT:    s_endpgm
375;
376; GFX9-LABEL: add_i32_varying_vdata:
377; GFX9:       ; %bb.0: ; %entry
378; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
379; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
380; GFX9-NEXT:    v_mov_b32_e32 v1, 0
381; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
382; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
383; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
384; GFX9-NEXT:    v_mov_b32_e32 v2, v0
385; GFX9-NEXT:    s_not_b64 exec, exec
386; GFX9-NEXT:    v_mov_b32_e32 v2, 0
387; GFX9-NEXT:    s_not_b64 exec, exec
388; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
389; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
390; GFX9-NEXT:    s_nop 1
391; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
392; GFX9-NEXT:    s_nop 1
393; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
394; GFX9-NEXT:    s_nop 1
395; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
396; GFX9-NEXT:    s_nop 1
397; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
398; GFX9-NEXT:    s_nop 1
399; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
400; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
401; GFX9-NEXT:    s_nop 0
402; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
403; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
404; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
405; GFX9-NEXT:    ; implicit-def: $vgpr0
406; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
407; GFX9-NEXT:    s_cbranch_execz .LBB2_2
408; GFX9-NEXT:  ; %bb.1:
409; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
410; GFX9-NEXT:    v_mov_b32_e32 v0, s6
411; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
412; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
413; GFX9-NEXT:  .LBB2_2:
414; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
415; GFX9-NEXT:    s_waitcnt vmcnt(0)
416; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
417; GFX9-NEXT:    v_mov_b32_e32 v0, v1
418; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
419; GFX9-NEXT:    v_mov_b32_e32 v3, 0
420; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
422; GFX9-NEXT:    s_endpgm
423;
424; GFX10W64-LABEL: add_i32_varying_vdata:
425; GFX10W64:       ; %bb.0: ; %entry
426; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
427; GFX10W64-NEXT:    s_not_b64 exec, exec
428; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
429; GFX10W64-NEXT:    s_not_b64 exec, exec
430; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
431; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
432; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
433; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
434; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
435; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
436; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
437; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
438; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
439; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
440; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
441; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
442; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
443; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
444; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
445; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
446; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
447; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
448; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
449; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
450; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
451; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
452; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
453; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
454; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
455; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
456; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
457; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
458; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
459; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
460; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
461; GFX10W64-NEXT:    ; implicit-def: $vgpr0
462; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
463; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
464; GFX10W64-NEXT:  ; %bb.1:
465; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
466; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
467; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
469; GFX10W64-NEXT:  .LBB2_2:
470; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
471; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
472; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
473; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
474; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
475; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
476; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
477; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
478; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
479; GFX10W64-NEXT:    s_endpgm
480;
481; GFX10W32-LABEL: add_i32_varying_vdata:
482; GFX10W32:       ; %bb.0: ; %entry
483; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
484; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
485; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
486; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
487; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
488; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
489; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
490; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
491; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
492; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
493; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
494; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
495; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
496; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
497; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
498; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
499; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
500; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
501; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
502; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
503; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
504; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
505; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
506; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
507; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
508; GFX10W32-NEXT:    ; implicit-def: $vgpr0
509; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
510; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
511; GFX10W32-NEXT:  ; %bb.1:
512; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
513; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
514; GFX10W32-NEXT:    s_mov_b32 s5, s6
515; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
516; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
517; GFX10W32-NEXT:  .LBB2_2:
518; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
519; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
520; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
521; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
522; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
523; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
524; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
525; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
526; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
527; GFX10W32-NEXT:    s_endpgm
528entry:
529  %lane = call i32 @llvm.amdgcn.workitem.id.x()
530  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
531  store i32 %old, i32 addrspace(1)* %out
532  ret void
533}
534
; add_i32_varying_offset: raw buffer atomic add of the constant 1 where the
; third (offset) operand is the result of llvm.amdgcn.workitem.id.x (%lane);
; the call result %old is stored to %out.
;
; What the autogenerated checks below expect: on every run line the atomic is
; emitted directly as "buffer_atomic_add ..., v0, ... offen glc", with no
; mbcnt / saveexec lane election and no cross-lane reduction. Both gfx1010 run
; lines share the common GFX10 check prefix here because their expected output
; is identical.
;
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
535define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
536; GFX6-LABEL: add_i32_varying_offset:
537; GFX6:       ; %bb.0: ; %entry
538; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
539; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
540; GFX6-NEXT:    v_mov_b32_e32 v1, 1
541; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
542; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
543; GFX6-NEXT:    s_mov_b32 s3, 0xf000
544; GFX6-NEXT:    s_mov_b32 s2, -1
545; GFX6-NEXT:    s_waitcnt vmcnt(0)
546; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
547; GFX6-NEXT:    s_endpgm
548;
549; GFX8-LABEL: add_i32_varying_offset:
550; GFX8:       ; %bb.0: ; %entry
551; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
552; GFX8-NEXT:    v_mov_b32_e32 v2, 1
553; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
554; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
555; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 offen glc
556; GFX8-NEXT:    v_mov_b32_e32 v0, s0
557; GFX8-NEXT:    v_mov_b32_e32 v1, s1
558; GFX8-NEXT:    s_waitcnt vmcnt(0)
559; GFX8-NEXT:    flat_store_dword v[0:1], v2
560; GFX8-NEXT:    s_endpgm
561;
562; GFX9-LABEL: add_i32_varying_offset:
563; GFX9:       ; %bb.0: ; %entry
564; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
565; GFX9-NEXT:    v_mov_b32_e32 v1, 1
566; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
567; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
568; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
569; GFX9-NEXT:    v_mov_b32_e32 v0, 0
570; GFX9-NEXT:    s_waitcnt vmcnt(0)
571; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
572; GFX9-NEXT:    s_endpgm
573;
574; GFX10-LABEL: add_i32_varying_offset:
575; GFX10:       ; %bb.0: ; %entry
576; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
577; GFX10-NEXT:    v_mov_b32_e32 v1, 1
578; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
579; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
580; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
581; GFX10-NEXT:    v_mov_b32_e32 v0, 0
582; GFX10-NEXT:    s_waitcnt vmcnt(0)
583; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
584; GFX10-NEXT:    s_endpgm
585entry:
586  %lane = call i32 @llvm.amdgcn.workitem.id.x()
587  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
588  store i32 %old, i32 addrspace(1)* %out
589  ret void
590}
591
; sub_i32_constant: raw buffer atomic sub of the constant 5 at offset 0; the
; call result %old is stored to %out.
;
; What the autogenerated checks below expect, for every run line:
;   * only the lane whose v_mbcnt result is 0 passes the s_and_saveexec guard
;     and issues a single buffer_atomic_sub;
;   * the value subtracted is s_bcnt1 of the saved exec mask multiplied by 5;
;   * after exec is restored, each lane computes
;     v_readfirstlane(atomic result) - mbcnt * 5 (v_mul_u32_u24 followed by a
;     subtract) and stores it.
;
; Do not edit the check lines by hand; regenerate them with
; utils/update_llc_test_checks.py (see the NOTE at the top of the file).
592define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
593; GFX6-LABEL: sub_i32_constant:
594; GFX6:       ; %bb.0: ; %entry
595; GFX6-NEXT:    s_mov_b64 s[2:3], exec
596; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
597; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
598; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
599; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
600; GFX6-NEXT:    ; implicit-def: $vgpr1
601; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
602; GFX6-NEXT:    s_cbranch_execz .LBB4_2
603; GFX6-NEXT:  ; %bb.1:
604; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
605; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
606; GFX6-NEXT:    s_mul_i32 s0, s0, 5
607; GFX6-NEXT:    v_mov_b32_e32 v1, s0
608; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
609; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
610; GFX6-NEXT:  .LBB4_2:
611; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
612; GFX6-NEXT:    s_waitcnt vmcnt(0)
613; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
614; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
615; GFX6-NEXT:    s_mov_b32 s7, 0xf000
616; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
617; GFX6-NEXT:    s_mov_b32 s6, -1
618; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
619; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
620; GFX6-NEXT:    s_endpgm
621;
622; GFX8-LABEL: sub_i32_constant:
623; GFX8:       ; %bb.0: ; %entry
624; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
625; GFX8-NEXT:    s_mov_b64 s[6:7], exec
626; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
627; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
628; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
629; GFX8-NEXT:    ; implicit-def: $vgpr1
630; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
631; GFX8-NEXT:    s_cbranch_execz .LBB4_2
632; GFX8-NEXT:  ; %bb.1:
633; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
634; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
635; GFX8-NEXT:    s_mul_i32 s0, s0, 5
636; GFX8-NEXT:    v_mov_b32_e32 v1, s0
637; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
638; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
639; GFX8-NEXT:  .LBB4_2:
640; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
641; GFX8-NEXT:    s_waitcnt vmcnt(0)
642; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
643; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
644; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
645; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
646; GFX8-NEXT:    v_mov_b32_e32 v0, s2
647; GFX8-NEXT:    v_mov_b32_e32 v1, s3
648; GFX8-NEXT:    flat_store_dword v[0:1], v2
649; GFX8-NEXT:    s_endpgm
650;
651; GFX9-LABEL: sub_i32_constant:
652; GFX9:       ; %bb.0: ; %entry
653; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
654; GFX9-NEXT:    s_mov_b64 s[6:7], exec
655; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
656; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
657; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
658; GFX9-NEXT:    ; implicit-def: $vgpr1
659; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
660; GFX9-NEXT:    s_cbranch_execz .LBB4_2
661; GFX9-NEXT:  ; %bb.1:
662; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
663; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
664; GFX9-NEXT:    s_mul_i32 s0, s0, 5
665; GFX9-NEXT:    v_mov_b32_e32 v1, s0
666; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
667; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
668; GFX9-NEXT:  .LBB4_2:
669; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
670; GFX9-NEXT:    s_waitcnt vmcnt(0)
671; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
672; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
673; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
674; GFX9-NEXT:    v_mov_b32_e32 v1, 0
675; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
676; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
677; GFX9-NEXT:    s_endpgm
678;
679; GFX10W64-LABEL: sub_i32_constant:
680; GFX10W64:       ; %bb.0: ; %entry
681; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
682; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
683; GFX10W64-NEXT:    ; implicit-def: $vgpr1
684; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
685; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
686; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
687; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
688; GFX10W64-NEXT:    s_cbranch_execz .LBB4_2
689; GFX10W64-NEXT:  ; %bb.1:
690; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
691; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
692; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
693; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
694; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
695; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
696; GFX10W64-NEXT:  .LBB4_2:
697; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
698; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
699; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
700; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
701; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
702; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
703; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
704; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
705; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
706; GFX10W64-NEXT:    s_endpgm
707;
708; GFX10W32-LABEL: sub_i32_constant:
709; GFX10W32:       ; %bb.0: ; %entry
710; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
711; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
712; GFX10W32-NEXT:    ; implicit-def: $vgpr1
713; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
714; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
715; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
716; GFX10W32-NEXT:    s_cbranch_execz .LBB4_2
717; GFX10W32-NEXT:  ; %bb.1:
718; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
719; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
720; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
721; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
722; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
723; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
724; GFX10W32-NEXT:  .LBB4_2:
725; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
726; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
727; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
728; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
729; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
730; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
731; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
732; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
734; GFX10W32-NEXT:    s_endpgm
735entry:
736  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
737  store i32 %old, i32 addrspace(1)* %out
738  ret void
739}
740
; sub_i32_uniform: raw-buffer atomic sub whose data operand (%subitive) is a
; kernel argument. In every check block below the argument is fetched with
; s_load_dword into a scalar register, and the expected code has this shape:
;   * v_mbcnt_* over a saved copy of exec, followed by v_cmp_eq_u32 against 0
;     and s_and_saveexec, guards the atomic;
;   * inside the guarded block the operand is s_bcnt1 of the saved exec
;     multiplied (s_mul_i32) by the loaded argument, and one
;     buffer_atomic_sub ... glc is issued;
;   * after exec is restored, the value stored to %out is
;     v_readfirstlane(returned value) - (argument * v_mbcnt result).
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
741define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
742; GFX6-LABEL: sub_i32_uniform:
743; GFX6:       ; %bb.0: ; %entry
744; GFX6-NEXT:    s_mov_b64 s[2:3], exec
745; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
746; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
747; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
748; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
749; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
750; GFX6-NEXT:    ; implicit-def: $vgpr1
751; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
752; GFX6-NEXT:    s_cbranch_execz .LBB5_2
753; GFX6-NEXT:  ; %bb.1:
754; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
755; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
756; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX6-NEXT:    s_mul_i32 s0, s8, s0
758; GFX6-NEXT:    v_mov_b32_e32 v1, s0
759; GFX6-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
760; GFX6-NEXT:  .LBB5_2:
761; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
762; GFX6-NEXT:    s_waitcnt vmcnt(0)
763; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
764; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
766; GFX6-NEXT:    s_mov_b32 s7, 0xf000
767; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
768; GFX6-NEXT:    s_mov_b32 s6, -1
769; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
770; GFX6-NEXT:    s_endpgm
771;
772; GFX8-LABEL: sub_i32_uniform:
773; GFX8:       ; %bb.0: ; %entry
774; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
775; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
776; GFX8-NEXT:    s_mov_b64 s[4:5], exec
777; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
778; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
779; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
780; GFX8-NEXT:    ; implicit-def: $vgpr1
781; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
782; GFX8-NEXT:    s_cbranch_execz .LBB5_2
783; GFX8-NEXT:  ; %bb.1:
784; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
785; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
786; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX8-NEXT:    s_mul_i32 s0, s8, s0
788; GFX8-NEXT:    v_mov_b32_e32 v1, s0
789; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
790; GFX8-NEXT:  .LBB5_2:
791; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
792; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
793; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
794; GFX8-NEXT:    s_waitcnt vmcnt(0)
795; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
796; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
797; GFX8-NEXT:    v_mov_b32_e32 v0, s2
798; GFX8-NEXT:    v_mov_b32_e32 v1, s3
799; GFX8-NEXT:    flat_store_dword v[0:1], v2
800; GFX8-NEXT:    s_endpgm
801;
802; GFX9-LABEL: sub_i32_uniform:
803; GFX9:       ; %bb.0: ; %entry
804; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
805; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
806; GFX9-NEXT:    s_mov_b64 s[4:5], exec
807; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
808; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
809; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
810; GFX9-NEXT:    ; implicit-def: $vgpr1
811; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
812; GFX9-NEXT:    s_cbranch_execz .LBB5_2
813; GFX9-NEXT:  ; %bb.1:
814; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
815; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
816; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
817; GFX9-NEXT:    s_mul_i32 s0, s8, s0
818; GFX9-NEXT:    v_mov_b32_e32 v1, s0
819; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
820; GFX9-NEXT:  .LBB5_2:
821; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
822; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
823; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
824; GFX9-NEXT:    s_waitcnt vmcnt(0)
825; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
826; GFX9-NEXT:    v_mov_b32_e32 v1, 0
827; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
828; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
829; GFX9-NEXT:    s_endpgm
830;
831; GFX10W64-LABEL: sub_i32_uniform:
832; GFX10W64:       ; %bb.0: ; %entry
833; GFX10W64-NEXT:    s_clause 0x1
834; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
835; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
836; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
837; GFX10W64-NEXT:    ; implicit-def: $vgpr1
838; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
839; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
840; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
841; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
842; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
843; GFX10W64-NEXT:  ; %bb.1:
844; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
845; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
846; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
847; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
848; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
849; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
850; GFX10W64-NEXT:  .LBB5_2:
851; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
852; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
853; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
854; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
855; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
856; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
857; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
858; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
859; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
860; GFX10W64-NEXT:    s_endpgm
861;
862; GFX10W32-LABEL: sub_i32_uniform:
863; GFX10W32:       ; %bb.0: ; %entry
864; GFX10W32-NEXT:    s_clause 0x1
865; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
866; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
867; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
868; GFX10W32-NEXT:    ; implicit-def: $vgpr1
869; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
870; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
871; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
872; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
873; GFX10W32-NEXT:  ; %bb.1:
874; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
875; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
876; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
877; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
878; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
879; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
880; GFX10W32-NEXT:  .LBB5_2:
881; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
882; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
883; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
884; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
885; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
886; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
887; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
888; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
889; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
890; GFX10W32-NEXT:    s_endpgm
891entry:
  ; The data operand is the kernel argument %subitive; the three i32 operands
  ; after %inout are all constant 0.
892  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; The value returned by the atomic is what the kernel writes to %out.
893  store i32 %old, i32 addrspace(1)* %out
894  ret void
895}
896
; sub_i32_varying_vdata: raw-buffer atomic sub whose data operand is the
; result of llvm.amdgcn.workitem.id.x, i.e. a per-lane value.
; Expected code per check block:
;   * GFX6 block - a single buffer_atomic_sub on v0 with no exec manipulation
;     and no cross-lane code; its result is stored directly.
;   * GFX8 and GFX9 blocks - a working copy of the operand has the lanes
;     outside exec zeroed (the s_not_b64 exec / v_mov 0 / s_not_b64 exec
;     sequence), is combined across lanes by a chain of v_add_u32_dpp steps
;     (row_shr 1/2/4/8, then row_bcast 15 and 31), lane 63 is read into s6 as
;     the operand of the atomic, and a wave_shr:1 copy is kept in v1. One
;     buffer_atomic_sub guarded by v_mbcnt/v_cmp/s_and_saveexec is issued and
;     the stored value is v_readfirstlane(result) - v1.
;   * GFX10W64 and GFX10W32 blocks - same outline, but the cross-row steps use
;     v_permlanex16_b32 together with v_readlane/v_writelane, and the operand
;     of the atomic is read from lane 63 (wave64) or lane 31 (wave32).
; The cross-lane combine uses add instructions although the atomic is a sub;
; the subtraction happens in the atomic itself and in the final v_sub.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
897define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
898; GFX6-LABEL: sub_i32_varying_vdata:
899; GFX6:       ; %bb.0: ; %entry
900; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
901; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
902; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
903; GFX6-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
904; GFX6-NEXT:    s_mov_b32 s3, 0xf000
905; GFX6-NEXT:    s_mov_b32 s2, -1
906; GFX6-NEXT:    s_waitcnt vmcnt(0)
907; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
908; GFX6-NEXT:    s_endpgm
909;
910; GFX8-LABEL: sub_i32_varying_vdata:
911; GFX8:       ; %bb.0: ; %entry
912; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
913; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
914; GFX8-NEXT:    v_mov_b32_e32 v1, 0
915; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
916; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
917; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
918; GFX8-NEXT:    v_mov_b32_e32 v2, v0
919; GFX8-NEXT:    s_not_b64 exec, exec
920; GFX8-NEXT:    v_mov_b32_e32 v2, 0
921; GFX8-NEXT:    s_not_b64 exec, exec
922; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
923; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
924; GFX8-NEXT:    s_nop 1
925; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
926; GFX8-NEXT:    s_nop 1
927; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
928; GFX8-NEXT:    s_nop 1
929; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
930; GFX8-NEXT:    s_nop 1
931; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
932; GFX8-NEXT:    s_nop 1
933; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
934; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
935; GFX8-NEXT:    s_nop 0
936; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
937; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
938; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
939; GFX8-NEXT:    ; implicit-def: $vgpr0
940; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
941; GFX8-NEXT:    s_cbranch_execz .LBB6_2
942; GFX8-NEXT:  ; %bb.1:
943; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
944; GFX8-NEXT:    v_mov_b32_e32 v0, s6
945; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
947; GFX8-NEXT:  .LBB6_2:
948; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
949; GFX8-NEXT:    s_waitcnt vmcnt(0)
950; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
951; GFX8-NEXT:    v_mov_b32_e32 v0, v1
952; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
953; GFX8-NEXT:    v_mov_b32_e32 v4, s3
954; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
955; GFX8-NEXT:    v_mov_b32_e32 v3, s2
956; GFX8-NEXT:    flat_store_dword v[3:4], v0
957; GFX8-NEXT:    s_endpgm
958;
959; GFX9-LABEL: sub_i32_varying_vdata:
960; GFX9:       ; %bb.0: ; %entry
961; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
962; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
963; GFX9-NEXT:    v_mov_b32_e32 v1, 0
964; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
965; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
966; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
967; GFX9-NEXT:    v_mov_b32_e32 v2, v0
968; GFX9-NEXT:    s_not_b64 exec, exec
969; GFX9-NEXT:    v_mov_b32_e32 v2, 0
970; GFX9-NEXT:    s_not_b64 exec, exec
971; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
972; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
973; GFX9-NEXT:    s_nop 1
974; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
975; GFX9-NEXT:    s_nop 1
976; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
977; GFX9-NEXT:    s_nop 1
978; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
979; GFX9-NEXT:    s_nop 1
980; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
981; GFX9-NEXT:    s_nop 1
982; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
983; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
984; GFX9-NEXT:    s_nop 0
985; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
986; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
987; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
988; GFX9-NEXT:    ; implicit-def: $vgpr0
989; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
990; GFX9-NEXT:    s_cbranch_execz .LBB6_2
991; GFX9-NEXT:  ; %bb.1:
992; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
993; GFX9-NEXT:    v_mov_b32_e32 v0, s6
994; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
995; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
996; GFX9-NEXT:  .LBB6_2:
997; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
998; GFX9-NEXT:    s_waitcnt vmcnt(0)
999; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1000; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1001; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1002; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1003; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1004; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
1005; GFX9-NEXT:    s_endpgm
1006;
1007; GFX10W64-LABEL: sub_i32_varying_vdata:
1008; GFX10W64:       ; %bb.0: ; %entry
1009; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
1010; GFX10W64-NEXT:    s_not_b64 exec, exec
1011; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1012; GFX10W64-NEXT:    s_not_b64 exec, exec
1013; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1014; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1015; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
1016; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1017; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1018; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1019; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
1020; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1021; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1022; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
1023; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
1024; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1025; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
1026; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1027; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
1028; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1029; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1030; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
1031; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
1032; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1033; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1034; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1035; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
1036; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
1037; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
1038; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1039; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1040; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1041; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
1042; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1043; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1044; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1045; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1046; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
1047; GFX10W64-NEXT:  ; %bb.1:
1048; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1049; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
1050; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1051; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1052; GFX10W64-NEXT:  .LBB6_2:
1053; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1054; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1055; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1056; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
1057; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
1058; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1059; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1060; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1061; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
1062; GFX10W64-NEXT:    s_endpgm
1063;
1064; GFX10W32-LABEL: sub_i32_varying_vdata:
1065; GFX10W32:       ; %bb.0: ; %entry
1066; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
1067; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1068; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1069; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1070; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
1071; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1072; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1073; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1074; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1075; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
1076; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1077; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
1078; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1079; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1080; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1081; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
1082; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
1083; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1084; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
1085; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1086; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1087; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1088; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
1089; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1090; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1091; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1092; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1093; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
1094; GFX10W32-NEXT:  ; %bb.1:
1095; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1096; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
1097; GFX10W32-NEXT:    s_mov_b32 s5, s6
1098; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1099; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1100; GFX10W32-NEXT:  .LBB6_2:
1101; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1102; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1103; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1104; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1105; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1106; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1107; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1108; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1109; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1110; GFX10W32-NEXT:    s_endpgm
1111entry:
  ; %lane differs per work item, which makes the data operand of the atomic
  ; non-uniform.
1112  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; %lane is the data operand; the three i32 operands after %inout are all
  ; constant 0.
1113  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
1114  store i32 %old, i32 addrspace(1)* %out
1115  ret void
1116}
1117
; sub_i32_varying_offset: raw-buffer atomic sub with a constant data operand
; (1) and the result of llvm.amdgcn.workitem.id.x passed as the i32 operand
; that directly follows %inout. Every check block expects the atomic to be
; emitted as-is: a single buffer_atomic_sub ... offen glc addressed through
; v0, with no v_mbcnt / s_and_saveexec guard and no cross-lane code, and the
; value it returns stored to %out. Both gfx1010 run lines are covered by the
; shared GFX10 prefix in this function.
; The check lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1118define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1119; GFX6-LABEL: sub_i32_varying_offset:
1120; GFX6:       ; %bb.0: ; %entry
1121; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1122; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1123; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1124; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1125; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1126; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1127; GFX6-NEXT:    s_mov_b32 s2, -1
1128; GFX6-NEXT:    s_waitcnt vmcnt(0)
1129; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1130; GFX6-NEXT:    s_endpgm
1131;
1132; GFX8-LABEL: sub_i32_varying_offset:
1133; GFX8:       ; %bb.0: ; %entry
1134; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1135; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1136; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1137; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1138; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
1139; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1140; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1141; GFX8-NEXT:    s_waitcnt vmcnt(0)
1142; GFX8-NEXT:    flat_store_dword v[0:1], v2
1143; GFX8-NEXT:    s_endpgm
1144;
1145; GFX9-LABEL: sub_i32_varying_offset:
1146; GFX9:       ; %bb.0: ; %entry
1147; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1148; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1149; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1150; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1151; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1152; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1153; GFX9-NEXT:    s_waitcnt vmcnt(0)
1154; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1155; GFX9-NEXT:    s_endpgm
1156;
1157; GFX10-LABEL: sub_i32_varying_offset:
1158; GFX10:       ; %bb.0: ; %entry
1159; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1160; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1161; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1162; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1163; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1164; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1165; GFX10-NEXT:    s_waitcnt vmcnt(0)
1166; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1167; GFX10-NEXT:    s_endpgm
1168entry:
1169  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; The data operand is the constant 1; %lane is the first i32 operand after
  ; %inout, and the remaining two i32 operands are constant 0.
1170  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
1171  store i32 %old, i32 addrspace(1)* %out
1172  ret void
1173}
1174