1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
9
10declare i32 @llvm.amdgcn.workitem.id.x()
11declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
12declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)
13
14; Show what the atomic optimization pass will do for raw buffers.
15
; add_i32_constant: raw buffer atomic add of the literal 5 on the buffer
; resource %inout; the value returned by the intrinsic is stored to %out.
;
; What the autogenerated assertions below pin down for every RUN line:
;   * the exec mask is copied to SGPRs and v_mbcnt_* leaves a per-lane count
;     in v0;
;   * only lanes whose count is 0 execute %bb.1 (compare against 0, then
;     s_cbranch_execz around the block);
;   * %bb.1 issues one buffer_atomic_add whose operand is
;     s_bcnt1(saved exec) multiplied by 5 (s_mul_i32);
;   * once exec is restored, v_readfirstlane_b32 reads the atomic's result and
;     v_mad_u32_u24 forms count * 5 + that result, which is the stored value.
; The wave32 runs use a 32-bit exec copy and s_bcnt1_i32_b32, and have no
; v_mbcnt_hi instruction.
16define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
17; GFX6-LABEL: add_i32_constant:
18; GFX6:       ; %bb.0: ; %entry
19; GFX6-NEXT:    s_mov_b64 s[2:3], exec
20; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
21; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
22; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
23; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
24; GFX6-NEXT:    ; implicit-def: $vgpr1
25; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
26; GFX6-NEXT:    s_cbranch_execz .LBB0_2
27; GFX6-NEXT:  ; %bb.1:
28; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
29; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
30; GFX6-NEXT:    s_mul_i32 s0, s0, 5
31; GFX6-NEXT:    v_mov_b32_e32 v1, s0
32; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
34; GFX6-NEXT:  .LBB0_2:
35; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
36; GFX6-NEXT:    s_waitcnt vmcnt(0)
37; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
38; GFX6-NEXT:    s_mov_b32 s7, 0xf000
39; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
40; GFX6-NEXT:    s_mov_b32 s6, -1
41; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
42; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
43; GFX6-NEXT:    s_endpgm
44;
45; GFX8-LABEL: add_i32_constant:
46; GFX8:       ; %bb.0: ; %entry
47; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
48; GFX8-NEXT:    s_mov_b64 s[6:7], exec
49; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
50; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
51; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
52; GFX8-NEXT:    ; implicit-def: $vgpr1
53; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
54; GFX8-NEXT:    s_cbranch_execz .LBB0_2
55; GFX8-NEXT:  ; %bb.1:
56; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
57; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
58; GFX8-NEXT:    s_mul_i32 s0, s0, 5
59; GFX8-NEXT:    v_mov_b32_e32 v1, s0
60; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
61; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
62; GFX8-NEXT:  .LBB0_2:
63; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
64; GFX8-NEXT:    s_waitcnt vmcnt(0)
65; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
66; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
67; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX8-NEXT:    v_mov_b32_e32 v0, s2
69; GFX8-NEXT:    v_mov_b32_e32 v1, s3
70; GFX8-NEXT:    flat_store_dword v[0:1], v2
71; GFX8-NEXT:    s_endpgm
72;
73; GFX9-LABEL: add_i32_constant:
74; GFX9:       ; %bb.0: ; %entry
75; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
76; GFX9-NEXT:    s_mov_b64 s[6:7], exec
77; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
78; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
79; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
80; GFX9-NEXT:    ; implicit-def: $vgpr1
81; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
82; GFX9-NEXT:    s_cbranch_execz .LBB0_2
83; GFX9-NEXT:  ; %bb.1:
84; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
85; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
86; GFX9-NEXT:    s_mul_i32 s0, s0, 5
87; GFX9-NEXT:    v_mov_b32_e32 v1, s0
88; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
89; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
90; GFX9-NEXT:  .LBB0_2:
91; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
92; GFX9-NEXT:    s_waitcnt vmcnt(0)
93; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
94; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
95; GFX9-NEXT:    v_mov_b32_e32 v1, 0
96; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
97; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
98; GFX9-NEXT:    s_endpgm
99;
100; GFX10W64-LABEL: add_i32_constant:
101; GFX10W64:       ; %bb.0: ; %entry
102; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
103; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
104; GFX10W64-NEXT:    ; implicit-def: $vgpr1
105; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
106; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
107; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
108; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
109; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
110; GFX10W64-NEXT:  ; %bb.1:
111; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
112; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
113; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
114; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
115; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
116; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
117; GFX10W64-NEXT:  .LBB0_2:
118; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
119; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
120; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
121; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
122; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
123; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
124; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
125; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
126; GFX10W64-NEXT:    s_endpgm
127;
128; GFX10W32-LABEL: add_i32_constant:
129; GFX10W32:       ; %bb.0: ; %entry
130; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
131; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
132; GFX10W32-NEXT:    ; implicit-def: $vgpr1
133; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
134; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
135; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
136; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
137; GFX10W32-NEXT:  ; %bb.1:
138; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
139; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
140; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
141; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
142; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
143; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
144; GFX10W32-NEXT:  .LBB0_2:
145; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
146; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
147; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
148; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
149; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
150; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
151; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
152; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
153; GFX10W32-NEXT:    s_endpgm
154;
155; GFX11W64-LABEL: add_i32_constant:
156; GFX11W64:       ; %bb.0: ; %entry
157; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
158; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
159; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
160; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
161; GFX11W64-NEXT:    ; implicit-def: $vgpr1
162; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
163; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
164; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
165; GFX11W64-NEXT:    s_cbranch_execz .LBB0_2
166; GFX11W64-NEXT:  ; %bb.1:
167; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
168; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
169; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
170; GFX11W64-NEXT:    s_mul_i32 s0, s0, 5
171; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
172; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
173; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
174; GFX11W64-NEXT:  .LBB0_2:
175; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
176; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
177; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
178; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
179; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
180; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
181; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
183; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
184; GFX11W64-NEXT:    s_endpgm
185;
186; GFX11W32-LABEL: add_i32_constant:
187; GFX11W32:       ; %bb.0: ; %entry
188; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
189; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
190; GFX11W32-NEXT:    s_mov_b32 s4, exec_lo
191; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
192; GFX11W32-NEXT:    ; implicit-def: $vgpr1
193; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
194; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
195; GFX11W32-NEXT:    s_cbranch_execz .LBB0_2
196; GFX11W32-NEXT:  ; %bb.1:
197; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
198; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s5
199; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
200; GFX11W32-NEXT:    s_mul_i32 s0, s0, 5
201; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
202; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
204; GFX11W32-NEXT:  .LBB0_2:
205; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
206; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
207; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
208; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
209; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
210; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
211; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
212; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
213; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
214; GFX11W32-NEXT:    s_endpgm
215entry:
  ; Data operand is the literal 5; the three trailing i32 operands are all 0.
216  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; The intrinsic's result is used, so the expected code requests the returned
  ; value (the "glc" modifier on every buffer_atomic_add above).
217  store i32 %old, i32 addrspace(1)* %out
218  ret void
219}
220
; add_i32_uniform: raw buffer atomic add on the buffer resource %inout where
; the data operand is the kernel argument %additive instead of a literal; the
; value returned by the intrinsic is stored to %out.
;
; What the autogenerated assertions below pin down for every RUN line:
;   * an extra scalar dword is loaded from the kernarg segment (offset 0x11 in
;     the first run, 0x44 in the others) - presumably %additive;
;   * v_mbcnt_* leaves a per-lane count in v0 and only lanes whose count is 0
;     execute %bb.1;
;   * %bb.1 issues one buffer_atomic_add whose operand is that scalar
;     multiplied by s_bcnt1(saved exec) (s_mul_i32);
;   * after exec is restored, the per-lane result is scalar * count plus the
;     v_readfirstlane_b32 of the atomic's result. The first three runs expect
;     v_mul_lo_u32 followed by a separate add; the gfx1010 and gfx1100 runs
;     expect a single v_mad_u64_u32.
221define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
222; GFX6-LABEL: add_i32_uniform:
223; GFX6:       ; %bb.0: ; %entry
224; GFX6-NEXT:    s_mov_b64 s[2:3], exec
225; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
226; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
227; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
228; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
229; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
230; GFX6-NEXT:    ; implicit-def: $vgpr1
231; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
232; GFX6-NEXT:    s_cbranch_execz .LBB1_2
233; GFX6-NEXT:  ; %bb.1:
234; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
235; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
236; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
237; GFX6-NEXT:    s_mul_i32 s0, s8, s0
238; GFX6-NEXT:    v_mov_b32_e32 v1, s0
239; GFX6-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
240; GFX6-NEXT:  .LBB1_2:
241; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
242; GFX6-NEXT:    s_waitcnt vmcnt(0)
243; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
244; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
245; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
246; GFX6-NEXT:    s_mov_b32 s7, 0xf000
247; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
248; GFX6-NEXT:    s_mov_b32 s6, -1
249; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
250; GFX6-NEXT:    s_endpgm
251;
252; GFX8-LABEL: add_i32_uniform:
253; GFX8:       ; %bb.0: ; %entry
254; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
255; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
256; GFX8-NEXT:    s_mov_b64 s[4:5], exec
257; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
258; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
259; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
260; GFX8-NEXT:    ; implicit-def: $vgpr1
261; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
262; GFX8-NEXT:    s_cbranch_execz .LBB1_2
263; GFX8-NEXT:  ; %bb.1:
264; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
265; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
266; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
267; GFX8-NEXT:    s_mul_i32 s0, s8, s0
268; GFX8-NEXT:    v_mov_b32_e32 v1, s0
269; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
270; GFX8-NEXT:  .LBB1_2:
271; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
272; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
274; GFX8-NEXT:    s_waitcnt vmcnt(0)
275; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
276; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
277; GFX8-NEXT:    v_mov_b32_e32 v0, s2
278; GFX8-NEXT:    v_mov_b32_e32 v1, s3
279; GFX8-NEXT:    flat_store_dword v[0:1], v2
280; GFX8-NEXT:    s_endpgm
281;
282; GFX9-LABEL: add_i32_uniform:
283; GFX9:       ; %bb.0: ; %entry
284; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
285; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
286; GFX9-NEXT:    s_mov_b64 s[4:5], exec
287; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
288; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
289; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
290; GFX9-NEXT:    ; implicit-def: $vgpr1
291; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
292; GFX9-NEXT:    s_cbranch_execz .LBB1_2
293; GFX9-NEXT:  ; %bb.1:
294; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
295; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
296; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
297; GFX9-NEXT:    s_mul_i32 s0, s8, s0
298; GFX9-NEXT:    v_mov_b32_e32 v1, s0
299; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
300; GFX9-NEXT:  .LBB1_2:
301; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
302; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
303; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
304; GFX9-NEXT:    s_waitcnt vmcnt(0)
305; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
306; GFX9-NEXT:    v_mov_b32_e32 v1, 0
307; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
308; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
309; GFX9-NEXT:    s_endpgm
310;
311; GFX10W64-LABEL: add_i32_uniform:
312; GFX10W64:       ; %bb.0: ; %entry
313; GFX10W64-NEXT:    s_clause 0x1
314; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
315; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
316; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
317; GFX10W64-NEXT:    ; implicit-def: $vgpr1
318; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
319; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
320; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
321; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
322; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
323; GFX10W64-NEXT:  ; %bb.1:
324; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
325; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
326; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
327; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
328; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
329; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
330; GFX10W64-NEXT:  .LBB1_2:
331; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
332; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
333; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
334; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
335; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
336; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
337; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
338; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
339; GFX10W64-NEXT:    s_endpgm
340;
341; GFX10W32-LABEL: add_i32_uniform:
342; GFX10W32:       ; %bb.0: ; %entry
343; GFX10W32-NEXT:    s_clause 0x1
344; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
345; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
346; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
347; GFX10W32-NEXT:    ; implicit-def: $vgpr1
348; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
349; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
350; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
351; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
352; GFX10W32-NEXT:  ; %bb.1:
353; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
354; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
355; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
356; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
357; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
358; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
359; GFX10W32-NEXT:  .LBB1_2:
360; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
361; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
362; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
363; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
364; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
365; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
366; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
367; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
368; GFX10W32-NEXT:    s_endpgm
369;
370; GFX11W64-LABEL: add_i32_uniform:
371; GFX11W64:       ; %bb.0: ; %entry
372; GFX11W64-NEXT:    s_clause 0x1
373; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
374; GFX11W64-NEXT:    s_load_b32 s8, s[0:1], 0x44
375; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
376; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
377; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
378; GFX11W64-NEXT:    ; implicit-def: $vgpr1
379; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
380; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
381; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
382; GFX11W64-NEXT:    s_cbranch_execz .LBB1_2
383; GFX11W64-NEXT:  ; %bb.1:
384; GFX11W64-NEXT:    s_load_b128 s[12:15], s[0:1], 0x34
385; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
386; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
387; GFX11W64-NEXT:    s_mul_i32 s0, s8, s0
388; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
389; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
390; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[12:15], 0 glc
391; GFX11W64-NEXT:  .LBB1_2:
392; GFX11W64-NEXT:    s_or_b64 exec, exec, s[6:7]
393; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
394; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
395; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
396; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
397; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
398; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
399; GFX11W64-NEXT:    global_store_b32 v0, v1, s[2:3]
400; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
401; GFX11W64-NEXT:    s_endpgm
402;
403; GFX11W32-LABEL: add_i32_uniform:
404; GFX11W32:       ; %bb.0: ; %entry
405; GFX11W32-NEXT:    s_clause 0x1
406; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
407; GFX11W32-NEXT:    s_load_b32 s4, s[0:1], 0x44
408; GFX11W32-NEXT:    s_mov_b32 s6, exec_lo
409; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
410; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
411; GFX11W32-NEXT:    ; implicit-def: $vgpr1
412; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
413; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
414; GFX11W32-NEXT:    s_cbranch_execz .LBB1_2
415; GFX11W32-NEXT:  ; %bb.1:
416; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
417; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s6
418; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX11W32-NEXT:    s_mul_i32 s0, s4, s0
420; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
421; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
422; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
423; GFX11W32-NEXT:  .LBB1_2:
424; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
425; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
426; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
427; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
428; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
429; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1]
430; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
431; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
432; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
433; GFX11W32-NEXT:    s_endpgm
434entry:
  ; Data operand is the kernel argument %additive; the three trailing i32
  ; operands are all 0.
435  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; The intrinsic's result is used, so the expected code requests the returned
  ; value (the "glc" modifier on every buffer_atomic_add above).
436  store i32 %old, i32 addrspace(1)* %out
437  ret void
438}
439
440define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
441; GFX6-LABEL: add_i32_varying_vdata:
442; GFX6:       ; %bb.0: ; %entry
443; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
444; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
445; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
446; GFX6-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
447; GFX6-NEXT:    s_mov_b32 s3, 0xf000
448; GFX6-NEXT:    s_mov_b32 s2, -1
449; GFX6-NEXT:    s_waitcnt vmcnt(0)
450; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
451; GFX6-NEXT:    s_endpgm
452;
453; GFX8-LABEL: add_i32_varying_vdata:
454; GFX8:       ; %bb.0: ; %entry
455; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
456; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
457; GFX8-NEXT:    v_mov_b32_e32 v1, 0
458; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
459; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
460; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
461; GFX8-NEXT:    v_mov_b32_e32 v2, v0
462; GFX8-NEXT:    s_not_b64 exec, exec
463; GFX8-NEXT:    v_mov_b32_e32 v2, 0
464; GFX8-NEXT:    s_not_b64 exec, exec
465; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
466; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
467; GFX8-NEXT:    s_nop 1
468; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
469; GFX8-NEXT:    s_nop 1
470; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
471; GFX8-NEXT:    s_nop 1
472; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
473; GFX8-NEXT:    s_nop 1
474; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
475; GFX8-NEXT:    s_nop 1
476; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
477; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
478; GFX8-NEXT:    s_nop 0
479; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
480; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
481; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
482; GFX8-NEXT:    ; implicit-def: $vgpr0
483; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
484; GFX8-NEXT:    s_cbranch_execz .LBB2_2
485; GFX8-NEXT:  ; %bb.1:
486; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
487; GFX8-NEXT:    v_mov_b32_e32 v0, s6
488; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
489; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
490; GFX8-NEXT:  .LBB2_2:
491; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
492; GFX8-NEXT:    s_waitcnt vmcnt(0)
493; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
494; GFX8-NEXT:    v_mov_b32_e32 v0, v1
495; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
496; GFX8-NEXT:    v_mov_b32_e32 v4, s3
497; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
498; GFX8-NEXT:    v_mov_b32_e32 v3, s2
499; GFX8-NEXT:    flat_store_dword v[3:4], v0
500; GFX8-NEXT:    s_endpgm
501;
502; GFX9-LABEL: add_i32_varying_vdata:
503; GFX9:       ; %bb.0: ; %entry
504; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
505; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
506; GFX9-NEXT:    v_mov_b32_e32 v1, 0
507; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
508; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
509; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
510; GFX9-NEXT:    v_mov_b32_e32 v2, v0
511; GFX9-NEXT:    s_not_b64 exec, exec
512; GFX9-NEXT:    v_mov_b32_e32 v2, 0
513; GFX9-NEXT:    s_not_b64 exec, exec
514; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
515; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
516; GFX9-NEXT:    s_nop 1
517; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
518; GFX9-NEXT:    s_nop 1
519; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
520; GFX9-NEXT:    s_nop 1
521; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
522; GFX9-NEXT:    s_nop 1
523; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
524; GFX9-NEXT:    s_nop 1
525; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
526; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
527; GFX9-NEXT:    s_nop 0
528; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
529; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
530; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
531; GFX9-NEXT:    ; implicit-def: $vgpr0
532; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
533; GFX9-NEXT:    s_cbranch_execz .LBB2_2
534; GFX9-NEXT:  ; %bb.1:
535; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
536; GFX9-NEXT:    v_mov_b32_e32 v0, s6
537; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
538; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
539; GFX9-NEXT:  .LBB2_2:
540; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
541; GFX9-NEXT:    s_waitcnt vmcnt(0)
542; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
543; GFX9-NEXT:    v_mov_b32_e32 v0, v1
544; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
545; GFX9-NEXT:    v_mov_b32_e32 v3, 0
546; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
547; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
548; GFX9-NEXT:    s_endpgm
549;
550; GFX10W64-LABEL: add_i32_varying_vdata:
551; GFX10W64:       ; %bb.0: ; %entry
552; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
553; GFX10W64-NEXT:    s_not_b64 exec, exec
554; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
555; GFX10W64-NEXT:    s_not_b64 exec, exec
556; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
557; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
558; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
559; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
560; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
561; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
562; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
563; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
564; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
565; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
566; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
567; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
568; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
569; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
570; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
571; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
572; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
573; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
574; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
575; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
576; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
577; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
578; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
579; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
580; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
581; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
582; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
583; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
584; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
585; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
586; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
587; GFX10W64-NEXT:    ; implicit-def: $vgpr0
588; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
589; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
590; GFX10W64-NEXT:  ; %bb.1:
591; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
592; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
593; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
594; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
595; GFX10W64-NEXT:  .LBB2_2:
596; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
597; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
598; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
599; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
600; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
601; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
602; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
603; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
604; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
605; GFX10W64-NEXT:    s_endpgm
606;
607; GFX10W32-LABEL: add_i32_varying_vdata:
608; GFX10W32:       ; %bb.0: ; %entry
609; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
610; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
611; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
612; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
613; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
614; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
615; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
616; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
617; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
618; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
619; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
620; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
621; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
622; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
623; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
624; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
625; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
626; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
627; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
628; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
629; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
630; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
631; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
632; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
633; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
634; GFX10W32-NEXT:    ; implicit-def: $vgpr0
635; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
636; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
637; GFX10W32-NEXT:  ; %bb.1:
638; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
639; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
640; GFX10W32-NEXT:    s_mov_b32 s5, s6
641; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
642; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
643; GFX10W32-NEXT:  .LBB2_2:
644; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
645; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
646; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
647; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
648; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
649; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
650; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
651; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
652; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
653; GFX10W32-NEXT:    s_endpgm
654;
655; GFX11W64-LABEL: add_i32_varying_vdata:
656; GFX11W64:       ; %bb.0: ; %entry
657; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
658; GFX11W64-NEXT:    s_not_b64 exec, exec
659; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
660; GFX11W64-NEXT:    s_not_b64 exec, exec
661; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
662; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
663; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
664; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
665; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
666; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
667; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
668; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
669; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
670; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
671; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
672; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
673; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
674; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
675; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
676; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
677; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
678; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
679; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 15
680; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
681; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
682; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
683; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
684; GFX11W64-NEXT:    v_readlane_b32 s7, v1, 31
685; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 16
686; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
687; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
688; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
689; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
690; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 63
691; GFX11W64-NEXT:    v_readlane_b32 s8, v1, 47
692; GFX11W64-NEXT:    v_writelane_b32 v3, s7, 32
693; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
694; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
695; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
696; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
697; GFX11W64-NEXT:    v_writelane_b32 v3, s8, 48
698; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
699; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
700; GFX11W64-NEXT:    ; implicit-def: $vgpr0
701; GFX11W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
702; GFX11W64-NEXT:    s_cbranch_execz .LBB2_2
703; GFX11W64-NEXT:  ; %bb.1:
704; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
705; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
706; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX11W64-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
708; GFX11W64-NEXT:  .LBB2_2:
709; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
710; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
711; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v0
712; GFX11W64-NEXT:    v_mov_b32_e32 v0, v3
713; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
714; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
715; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
716; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
717; GFX11W64-NEXT:    global_store_b32 v4, v0, s[2:3]
718; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
719; GFX11W64-NEXT:    s_endpgm
720;
721; GFX11W32-LABEL: add_i32_varying_vdata:
722; GFX11W32:       ; %bb.0: ; %entry
723; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
724; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
725; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
726; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
727; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
728; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
729; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
730; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
731; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
732; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
733; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
734; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
735; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
736; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
737; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
738; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
739; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
740; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
741; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
742; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
743; GFX11W32-NEXT:    v_readlane_b32 s6, v1, 31
744; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
745; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
746; GFX11W32-NEXT:    v_readlane_b32 s5, v1, 15
747; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
748; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
749; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
750; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
751; GFX11W32-NEXT:    v_writelane_b32 v3, s5, 16
752; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
753; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
754; GFX11W32-NEXT:    ; implicit-def: $vgpr0
755; GFX11W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
756; GFX11W32-NEXT:    s_cbranch_execz .LBB2_2
757; GFX11W32-NEXT:  ; %bb.1:
758; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
759; GFX11W32-NEXT:    v_mov_b32_e32 v0, s6
760; GFX11W32-NEXT:    s_mov_b32 s5, s6
761; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
762; GFX11W32-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
763; GFX11W32-NEXT:  .LBB2_2:
764; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
765; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
766; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
767; GFX11W32-NEXT:    v_mov_b32_e32 v0, v3
768; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
769; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
770; GFX11W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
771; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
772; GFX11W32-NEXT:    global_store_b32 v4, v0, s[2:3]
773; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
774; GFX11W32-NEXT:    s_endpgm
775entry:
776  %lane = call i32 @llvm.amdgcn.workitem.id.x()
777  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
778  store i32 %old, i32 addrspace(1)* %out
779  ret void
780}
781
; Atomic add of the constant 1 where the third intrinsic operand (the one the
; function name calls the offset) is the per-lane workitem id. The assertions
; below show that every checked target keeps a plain per-lane
; buffer_atomic_add ... offen: no v_mbcnt / saveexec single-lane sequence is
; emitted for this kernel.
782define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
783; GFX6-LABEL: add_i32_varying_offset:
784; GFX6:       ; %bb.0: ; %entry
785; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
786; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
787; GFX6-NEXT:    v_mov_b32_e32 v1, 1
788; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
789; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
790; GFX6-NEXT:    s_mov_b32 s3, 0xf000
791; GFX6-NEXT:    s_mov_b32 s2, -1
792; GFX6-NEXT:    s_waitcnt vmcnt(0)
793; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
794; GFX6-NEXT:    s_endpgm
795;
796; GFX8-LABEL: add_i32_varying_offset:
797; GFX8:       ; %bb.0: ; %entry
798; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
799; GFX8-NEXT:    v_mov_b32_e32 v2, 1
800; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
801; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 offen glc
803; GFX8-NEXT:    v_mov_b32_e32 v0, s0
804; GFX8-NEXT:    v_mov_b32_e32 v1, s1
805; GFX8-NEXT:    s_waitcnt vmcnt(0)
806; GFX8-NEXT:    flat_store_dword v[0:1], v2
807; GFX8-NEXT:    s_endpgm
808;
809; GFX9-LABEL: add_i32_varying_offset:
810; GFX9:       ; %bb.0: ; %entry
811; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
812; GFX9-NEXT:    v_mov_b32_e32 v1, 1
813; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
814; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
815; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
816; GFX9-NEXT:    v_mov_b32_e32 v0, 0
817; GFX9-NEXT:    s_waitcnt vmcnt(0)
818; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
819; GFX9-NEXT:    s_endpgm
820;
821; GFX10-LABEL: add_i32_varying_offset:
822; GFX10:       ; %bb.0: ; %entry
823; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
824; GFX10-NEXT:    v_mov_b32_e32 v1, 1
825; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
826; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
827; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
828; GFX10-NEXT:    v_mov_b32_e32 v0, 0
829; GFX10-NEXT:    s_waitcnt vmcnt(0)
830; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
831; GFX10-NEXT:    s_endpgm
832;
833; GFX11-LABEL: add_i32_varying_offset:
834; GFX11:       ; %bb.0: ; %entry
835; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
836; GFX11-NEXT:    v_mov_b32_e32 v1, 1
837; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
838; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
839; GFX11-NEXT:    buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc
840; GFX11-NEXT:    v_mov_b32_e32 v0, 0
841; GFX11-NEXT:    s_waitcnt vmcnt(0)
842; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
843; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
844; GFX11-NEXT:    s_endpgm
845entry:
846  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Constant data operand (1); %lane is passed as the third operand, so that
  ; operand differs from lane to lane.
847  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
848  store i32 %old, i32 addrspace(1)* %out
849  ret void
850}
851
; Atomic sub of the constant 5 with all-constant remaining operands. The
; assertions below show the expected single-lane pattern on every target:
;   * v_mbcnt_* counts the active lanes below the current one, and only the
;     lane whose count is 0 enters %bb.1;
;   * that lane issues one buffer_atomic_sub of (s_bcnt1 of the saved exec
;     mask) * 5;
;   * after rejoining, each lane reads the returned value with
;     v_readfirstlane and subtracts 5 * its own mbcnt value before the store.
; GFX11 uses v_cmpx_eq_u32 plus a separately saved exec copy where the older
; targets use v_cmp_eq_u32 followed by s_and_saveexec.
852define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
853; GFX6-LABEL: sub_i32_constant:
854; GFX6:       ; %bb.0: ; %entry
855; GFX6-NEXT:    s_mov_b64 s[2:3], exec
856; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
857; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
858; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
859; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
860; GFX6-NEXT:    ; implicit-def: $vgpr1
861; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
862; GFX6-NEXT:    s_cbranch_execz .LBB4_2
863; GFX6-NEXT:  ; %bb.1:
864; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
865; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
866; GFX6-NEXT:    s_mul_i32 s0, s0, 5
867; GFX6-NEXT:    v_mov_b32_e32 v1, s0
868; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
869; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
870; GFX6-NEXT:  .LBB4_2:
871; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
872; GFX6-NEXT:    s_waitcnt vmcnt(0)
873; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
874; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
875; GFX6-NEXT:    s_mov_b32 s7, 0xf000
876; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
877; GFX6-NEXT:    s_mov_b32 s6, -1
878; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
879; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
880; GFX6-NEXT:    s_endpgm
881;
882; GFX8-LABEL: sub_i32_constant:
883; GFX8:       ; %bb.0: ; %entry
884; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
885; GFX8-NEXT:    s_mov_b64 s[6:7], exec
886; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
887; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
888; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
889; GFX8-NEXT:    ; implicit-def: $vgpr1
890; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
891; GFX8-NEXT:    s_cbranch_execz .LBB4_2
892; GFX8-NEXT:  ; %bb.1:
893; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
894; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
895; GFX8-NEXT:    s_mul_i32 s0, s0, 5
896; GFX8-NEXT:    v_mov_b32_e32 v1, s0
897; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
898; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
899; GFX8-NEXT:  .LBB4_2:
900; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
901; GFX8-NEXT:    s_waitcnt vmcnt(0)
902; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
903; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
904; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
905; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
906; GFX8-NEXT:    v_mov_b32_e32 v0, s2
907; GFX8-NEXT:    v_mov_b32_e32 v1, s3
908; GFX8-NEXT:    flat_store_dword v[0:1], v2
909; GFX8-NEXT:    s_endpgm
910;
911; GFX9-LABEL: sub_i32_constant:
912; GFX9:       ; %bb.0: ; %entry
913; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
914; GFX9-NEXT:    s_mov_b64 s[6:7], exec
915; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
916; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
917; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
918; GFX9-NEXT:    ; implicit-def: $vgpr1
919; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
920; GFX9-NEXT:    s_cbranch_execz .LBB4_2
921; GFX9-NEXT:  ; %bb.1:
922; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
923; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
924; GFX9-NEXT:    s_mul_i32 s0, s0, 5
925; GFX9-NEXT:    v_mov_b32_e32 v1, s0
926; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
928; GFX9-NEXT:  .LBB4_2:
929; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
930; GFX9-NEXT:    s_waitcnt vmcnt(0)
931; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
932; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
933; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
934; GFX9-NEXT:    v_mov_b32_e32 v1, 0
935; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
936; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
937; GFX9-NEXT:    s_endpgm
938;
939; GFX10W64-LABEL: sub_i32_constant:
940; GFX10W64:       ; %bb.0: ; %entry
941; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
942; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
943; GFX10W64-NEXT:    ; implicit-def: $vgpr1
944; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
945; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
946; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
947; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
948; GFX10W64-NEXT:    s_cbranch_execz .LBB4_2
949; GFX10W64-NEXT:  ; %bb.1:
950; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
951; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
952; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
953; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
954; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
955; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
956; GFX10W64-NEXT:  .LBB4_2:
957; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
958; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
959; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
960; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
961; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
962; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
963; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
964; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
965; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
966; GFX10W64-NEXT:    s_endpgm
967;
968; GFX10W32-LABEL: sub_i32_constant:
969; GFX10W32:       ; %bb.0: ; %entry
970; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
971; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
972; GFX10W32-NEXT:    ; implicit-def: $vgpr1
973; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
974; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
975; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
976; GFX10W32-NEXT:    s_cbranch_execz .LBB4_2
977; GFX10W32-NEXT:  ; %bb.1:
978; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
979; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
980; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
981; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
982; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
983; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
984; GFX10W32-NEXT:  .LBB4_2:
985; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
986; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
987; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
988; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
989; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
990; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
991; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
992; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
994; GFX10W32-NEXT:    s_endpgm
995;
996; GFX11W64-LABEL: sub_i32_constant:
997; GFX11W64:       ; %bb.0: ; %entry
998; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
999; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
1000; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
1001; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1002; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1003; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1004; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1005; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1006; GFX11W64-NEXT:    s_cbranch_execz .LBB4_2
1007; GFX11W64-NEXT:  ; %bb.1:
1008; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1009; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1010; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1011; GFX11W64-NEXT:    s_mul_i32 s0, s0, 5
1012; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
1013; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1014; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1015; GFX11W64-NEXT:  .LBB4_2:
1016; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1017; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1018; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
1019; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1020; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1021; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1022; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1023; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1024; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
1025; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1026; GFX11W64-NEXT:    s_endpgm
1027;
1028; GFX11W32-LABEL: sub_i32_constant:
1029; GFX11W32:       ; %bb.0: ; %entry
1030; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1031; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
1032; GFX11W32-NEXT:    s_mov_b32 s4, exec_lo
1033; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1034; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1035; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1036; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1037; GFX11W32-NEXT:    s_cbranch_execz .LBB4_2
1038; GFX11W32-NEXT:  ; %bb.1:
1039; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1040; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s5
1041; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1042; GFX11W32-NEXT:    s_mul_i32 s0, s0, 5
1043; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
1044; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1046; GFX11W32-NEXT:  .LBB4_2:
1047; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1048; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1049; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
1050; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1051; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1052; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1053; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1054; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1055; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1056; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1057; GFX11W32-NEXT:    s_endpgm
1058entry:
  ; Every operand except the buffer descriptor %inout is a compile-time
  ; constant; the data operand is 5.
1059  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
1060  store i32 %old, i32 addrspace(1)* %out
1061  ret void
1062}
1063
; Atomic sub whose data operand is the kernel argument %subitive rather than
; an immediate. The assertions below show the same single-lane shape as the
; constant case, with the argument loaded into an SGPR (s_load_dword /
; s_load_b32) and used twice:
;   * s_mul_i32 multiplies it by the s_bcnt1 count of the saved exec mask to
;     form the value for the one buffer_atomic_sub;
;   * v_mul_lo_u32 multiplies it by each lane's mbcnt value, and the product
;     is subtracted from the v_readfirstlane result before the store.
1064define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
1065; GFX6-LABEL: sub_i32_uniform:
1066; GFX6:       ; %bb.0: ; %entry
1067; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1068; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1069; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
1070; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1071; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1072; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1073; GFX6-NEXT:    ; implicit-def: $vgpr1
1074; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1075; GFX6-NEXT:    s_cbranch_execz .LBB5_2
1076; GFX6-NEXT:  ; %bb.1:
1077; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
1078; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
1079; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1080; GFX6-NEXT:    s_mul_i32 s0, s8, s0
1081; GFX6-NEXT:    v_mov_b32_e32 v1, s0
1082; GFX6-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1083; GFX6-NEXT:  .LBB5_2:
1084; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
1085; GFX6-NEXT:    s_waitcnt vmcnt(0)
1086; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
1087; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
1089; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1090; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1091; GFX6-NEXT:    s_mov_b32 s6, -1
1092; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1093; GFX6-NEXT:    s_endpgm
1094;
1095; GFX8-LABEL: sub_i32_uniform:
1096; GFX8:       ; %bb.0: ; %entry
1097; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1098; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
1099; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1100; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1101; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1102; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1103; GFX8-NEXT:    ; implicit-def: $vgpr1
1104; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1105; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1106; GFX8-NEXT:  ; %bb.1:
1107; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1108; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1109; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1110; GFX8-NEXT:    s_mul_i32 s0, s8, s0
1111; GFX8-NEXT:    v_mov_b32_e32 v1, s0
1112; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1113; GFX8-NEXT:  .LBB5_2:
1114; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1115; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
1117; GFX8-NEXT:    s_waitcnt vmcnt(0)
1118; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1119; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
1120; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1121; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1122; GFX8-NEXT:    flat_store_dword v[0:1], v2
1123; GFX8-NEXT:    s_endpgm
1124;
1125; GFX9-LABEL: sub_i32_uniform:
1126; GFX9:       ; %bb.0: ; %entry
1127; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1128; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
1129; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1130; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1131; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1132; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1133; GFX9-NEXT:    ; implicit-def: $vgpr1
1134; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1135; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1136; GFX9-NEXT:  ; %bb.1:
1137; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1138; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1139; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1140; GFX9-NEXT:    s_mul_i32 s0, s8, s0
1141; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1142; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1143; GFX9-NEXT:  .LBB5_2:
1144; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
1145; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1146; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
1147; GFX9-NEXT:    s_waitcnt vmcnt(0)
1148; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1149; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1150; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1151; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
1152; GFX9-NEXT:    s_endpgm
1153;
1154; GFX10W64-LABEL: sub_i32_uniform:
1155; GFX10W64:       ; %bb.0: ; %entry
1156; GFX10W64-NEXT:    s_clause 0x1
1157; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1158; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
1159; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
1160; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1161; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1162; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1163; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1164; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1165; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
1166; GFX10W64-NEXT:  ; %bb.1:
1167; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1168; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1169; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1170; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
1171; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
1172; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1173; GFX10W64-NEXT:  .LBB5_2:
1174; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1175; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
1176; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1177; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
1178; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1179; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
1180; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1181; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1182; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
1183; GFX10W64-NEXT:    s_endpgm
1184;
1185; GFX10W32-LABEL: sub_i32_uniform:
1186; GFX10W32:       ; %bb.0: ; %entry
1187; GFX10W32-NEXT:    s_clause 0x1
1188; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1189; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
1190; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
1191; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1192; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1193; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1194; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
1195; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
1196; GFX10W32-NEXT:  ; %bb.1:
1197; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1198; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
1199; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
1201; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
1202; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1203; GFX10W32-NEXT:  .LBB5_2:
1204; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1205; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
1206; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1207; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
1208; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1209; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1210; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1211; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1212; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1213; GFX10W32-NEXT:    s_endpgm
1214;
1215; GFX11W64-LABEL: sub_i32_uniform:
1216; GFX11W64:       ; %bb.0: ; %entry
1217; GFX11W64-NEXT:    s_clause 0x1
1218; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1219; GFX11W64-NEXT:    s_load_b32 s8, s[0:1], 0x44
1220; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
1221; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
1222; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1223; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1224; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1225; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1226; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1227; GFX11W64-NEXT:    s_cbranch_execz .LBB5_2
1228; GFX11W64-NEXT:  ; %bb.1:
1229; GFX11W64-NEXT:    s_load_b128 s[12:15], s[0:1], 0x34
1230; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1231; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1232; GFX11W64-NEXT:    s_mul_i32 s0, s8, s0
1233; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1234; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
1235; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc
1236; GFX11W64-NEXT:  .LBB5_2:
1237; GFX11W64-NEXT:    s_or_b64 exec, exec, s[6:7]
1238; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1239; GFX11W64-NEXT:    v_mul_lo_u32 v0, s8, v0
1240; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1241; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
1242; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1243; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1244; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1245; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
1246; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1247; GFX11W64-NEXT:    s_endpgm
1248;
1249; GFX11W32-LABEL: sub_i32_uniform:
1250; GFX11W32:       ; %bb.0: ; %entry
1251; GFX11W32-NEXT:    s_clause 0x1
1252; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1253; GFX11W32-NEXT:    s_load_b32 s4, s[0:1], 0x44
1254; GFX11W32-NEXT:    s_mov_b32 s6, exec_lo
1255; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
1256; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1257; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1258; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1259; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1260; GFX11W32-NEXT:    s_cbranch_execz .LBB5_2
1261; GFX11W32-NEXT:  ; %bb.1:
1262; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1263; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s6
1264; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1265; GFX11W32-NEXT:    s_mul_i32 s0, s4, s0
1266; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1267; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
1268; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1269; GFX11W32-NEXT:  .LBB5_2:
1270; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
1271; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1272; GFX11W32-NEXT:    v_mul_lo_u32 v0, s4, v0
1273; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1274; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
1275; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1276; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1277; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1278; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1279; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1280; GFX11W32-NEXT:    s_endpgm
1281entry:
  ; The data operand is the kernel argument %subitive; the remaining
  ; non-descriptor operands are constant zeros.
1282  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
1283  store i32 %old, i32 addrspace(1)* %out
1284  ret void
1285}
1286
1287define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
1288; GFX6-LABEL: sub_i32_varying_vdata:
1289; GFX6:       ; %bb.0: ; %entry
1290; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1291; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1292; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX6-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
1294; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1295; GFX6-NEXT:    s_mov_b32 s2, -1
1296; GFX6-NEXT:    s_waitcnt vmcnt(0)
1297; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1298; GFX6-NEXT:    s_endpgm
1299;
1300; GFX8-LABEL: sub_i32_varying_vdata:
1301; GFX8:       ; %bb.0: ; %entry
1302; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1303; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1304; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1305; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1306; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1307; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1308; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1309; GFX8-NEXT:    s_not_b64 exec, exec
1310; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1311; GFX8-NEXT:    s_not_b64 exec, exec
1312; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1313; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1314; GFX8-NEXT:    s_nop 1
1315; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1316; GFX8-NEXT:    s_nop 1
1317; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1318; GFX8-NEXT:    s_nop 1
1319; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1320; GFX8-NEXT:    s_nop 1
1321; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1322; GFX8-NEXT:    s_nop 1
1323; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1324; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
1325; GFX8-NEXT:    s_nop 0
1326; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1327; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1328; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1329; GFX8-NEXT:    ; implicit-def: $vgpr0
1330; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1331; GFX8-NEXT:    s_cbranch_execz .LBB6_2
1332; GFX8-NEXT:  ; %bb.1:
1333; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1334; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1335; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1336; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1337; GFX8-NEXT:  .LBB6_2:
1338; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1339; GFX8-NEXT:    s_waitcnt vmcnt(0)
1340; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1341; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1342; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX8-NEXT:    v_mov_b32_e32 v4, s3
1344; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1345; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1346; GFX8-NEXT:    flat_store_dword v[3:4], v0
1347; GFX8-NEXT:    s_endpgm
1348;
1349; GFX9-LABEL: sub_i32_varying_vdata:
1350; GFX9:       ; %bb.0: ; %entry
1351; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1352; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1353; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1354; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1355; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1356; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1357; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1358; GFX9-NEXT:    s_not_b64 exec, exec
1359; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1360; GFX9-NEXT:    s_not_b64 exec, exec
1361; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1362; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1363; GFX9-NEXT:    s_nop 1
1364; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1365; GFX9-NEXT:    s_nop 1
1366; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1367; GFX9-NEXT:    s_nop 1
1368; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1369; GFX9-NEXT:    s_nop 1
1370; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1371; GFX9-NEXT:    s_nop 1
1372; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1373; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
1374; GFX9-NEXT:    s_nop 0
1375; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1376; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1377; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1378; GFX9-NEXT:    ; implicit-def: $vgpr0
1379; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1380; GFX9-NEXT:    s_cbranch_execz .LBB6_2
1381; GFX9-NEXT:  ; %bb.1:
1382; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1383; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1384; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1386; GFX9-NEXT:  .LBB6_2:
1387; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1388; GFX9-NEXT:    s_waitcnt vmcnt(0)
1389; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1390; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1391; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1392; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1393; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1394; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
1395; GFX9-NEXT:    s_endpgm
1396;
1397; GFX10W64-LABEL: sub_i32_varying_vdata:
1398; GFX10W64:       ; %bb.0: ; %entry
1399; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
1400; GFX10W64-NEXT:    s_not_b64 exec, exec
1401; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1402; GFX10W64-NEXT:    s_not_b64 exec, exec
1403; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1404; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1405; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
1406; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1407; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1408; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1409; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
1410; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1411; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1412; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
1413; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
1414; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1415; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
1416; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1417; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
1418; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1419; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1420; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
1421; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
1422; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1423; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1424; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1425; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
1426; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
1427; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
1428; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1429; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1430; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1431; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
1432; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1433; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1434; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1435; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1436; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
1437; GFX10W64-NEXT:  ; %bb.1:
1438; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1439; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
1440; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1442; GFX10W64-NEXT:  .LBB6_2:
1443; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1444; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1445; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1446; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
1447; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
1448; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1449; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1450; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1451; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
1452; GFX10W64-NEXT:    s_endpgm
1453;
1454; GFX10W32-LABEL: sub_i32_varying_vdata:
1455; GFX10W32:       ; %bb.0: ; %entry
1456; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
1457; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1458; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1459; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1460; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
1461; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1462; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1463; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1464; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1465; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
1466; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1467; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
1468; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1469; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1470; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1471; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
1472; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
1473; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1474; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
1475; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1476; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1477; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1478; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
1479; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1480; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1481; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1482; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1483; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
1484; GFX10W32-NEXT:  ; %bb.1:
1485; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1486; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
1487; GFX10W32-NEXT:    s_mov_b32 s5, s6
1488; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1489; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1490; GFX10W32-NEXT:  .LBB6_2:
1491; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1492; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1493; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1494; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1495; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1496; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1497; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1498; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1499; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1500; GFX10W32-NEXT:    s_endpgm
1501;
1502; GFX11W64-LABEL: sub_i32_varying_vdata:
1503; GFX11W64:       ; %bb.0: ; %entry
1504; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
1505; GFX11W64-NEXT:    s_not_b64 exec, exec
1506; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1507; GFX11W64-NEXT:    s_not_b64 exec, exec
1508; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1509; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1510; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1511; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
1512; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1513; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1514; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1515; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1516; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1517; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
1518; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1519; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1520; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1521; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
1522; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1523; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
1524; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1525; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1526; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 15
1527; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1528; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
1529; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1530; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1531; GFX11W64-NEXT:    v_readlane_b32 s7, v1, 31
1532; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 16
1533; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1534; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1535; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1536; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1537; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 63
1538; GFX11W64-NEXT:    v_readlane_b32 s8, v1, 47
1539; GFX11W64-NEXT:    v_writelane_b32 v3, s7, 32
1540; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1541; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1542; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1543; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1544; GFX11W64-NEXT:    v_writelane_b32 v3, s8, 48
1545; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1546; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1547; GFX11W64-NEXT:    ; implicit-def: $vgpr0
1548; GFX11W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1549; GFX11W64-NEXT:    s_cbranch_execz .LBB6_2
1550; GFX11W64-NEXT:  ; %bb.1:
1551; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1552; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
1553; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1554; GFX11W64-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
1555; GFX11W64-NEXT:  .LBB6_2:
1556; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1557; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1558; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v0
1559; GFX11W64-NEXT:    v_mov_b32_e32 v0, v3
1560; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
1561; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1562; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1563; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1564; GFX11W64-NEXT:    global_store_b32 v4, v0, s[2:3]
1565; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1566; GFX11W64-NEXT:    s_endpgm
1567;
1568; GFX11W32-LABEL: sub_i32_varying_vdata:
1569; GFX11W32:       ; %bb.0: ; %entry
1570; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
1571; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
1572; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1573; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
1574; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
1575; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1576; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1577; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1578; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1579; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1580; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1581; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1582; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
1583; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1584; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
1585; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1586; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
1587; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1588; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1589; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
1590; GFX11W32-NEXT:    v_readlane_b32 s6, v1, 31
1591; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
1592; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1593; GFX11W32-NEXT:    v_readlane_b32 s5, v1, 15
1594; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
1595; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1596; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
1597; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1598; GFX11W32-NEXT:    v_writelane_b32 v3, s5, 16
1599; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
1600; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1601; GFX11W32-NEXT:    ; implicit-def: $vgpr0
1602; GFX11W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1603; GFX11W32-NEXT:    s_cbranch_execz .LBB6_2
1604; GFX11W32-NEXT:  ; %bb.1:
1605; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1606; GFX11W32-NEXT:    v_mov_b32_e32 v0, s6
1607; GFX11W32-NEXT:    s_mov_b32 s5, s6
1608; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1609; GFX11W32-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
1610; GFX11W32-NEXT:  .LBB6_2:
1611; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1612; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1613; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
1614; GFX11W32-NEXT:    v_mov_b32_e32 v0, v3
1615; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
1616; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1617; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1618; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1619; GFX11W32-NEXT:    global_store_b32 v4, v0, s[2:3]
1620; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1621; GFX11W32-NEXT:    s_endpgm
1622entry:
1623  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1624  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
1625  store i32 %old, i32 addrspace(1)* %out
1626  ret void
1627}
1628
; Raw buffer atomic sub whose data operand is the constant 1 and whose third
; (offset) operand is the workitem id, i.e. a value that differs per lane.
; The autogenerated checks below expect, on every tested target, a single
; buffer atomic sub using a VGPR offset ("offen") and returning its result
; ("glc"), followed directly by the store to %out. No mbcnt / DPP / exec-mask
; sequence is expected around the atomic.
; NOTE(review): presumably the atomic optimizer leaves this call alone because
; the per-lane offset is not uniform -- confirm against the pass implementation.
1629define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1630; GFX6-LABEL: sub_i32_varying_offset:
1631; GFX6:       ; %bb.0: ; %entry
1632; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1633; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1634; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1635; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1636; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1637; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1638; GFX6-NEXT:    s_mov_b32 s2, -1
1639; GFX6-NEXT:    s_waitcnt vmcnt(0)
1640; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1641; GFX6-NEXT:    s_endpgm
1642;
1643; GFX8-LABEL: sub_i32_varying_offset:
1644; GFX8:       ; %bb.0: ; %entry
1645; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1646; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1647; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1648; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1649; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
1650; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1651; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1652; GFX8-NEXT:    s_waitcnt vmcnt(0)
1653; GFX8-NEXT:    flat_store_dword v[0:1], v2
1654; GFX8-NEXT:    s_endpgm
1655;
1656; GFX9-LABEL: sub_i32_varying_offset:
1657; GFX9:       ; %bb.0: ; %entry
1658; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1659; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1660; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1661; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1662; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1663; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1664; GFX9-NEXT:    s_waitcnt vmcnt(0)
1665; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1666; GFX9-NEXT:    s_endpgm
1667;
1668; GFX10-LABEL: sub_i32_varying_offset:
1669; GFX10:       ; %bb.0: ; %entry
1670; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1671; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1672; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1673; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1674; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1675; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1676; GFX10-NEXT:    s_waitcnt vmcnt(0)
1677; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1678; GFX10-NEXT:    s_endpgm
1679;
1680; GFX11-LABEL: sub_i32_varying_offset:
1681; GFX11:       ; %bb.0: ; %entry
1682; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
1683; GFX11-NEXT:    v_mov_b32_e32 v1, 1
1684; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1685; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1686; GFX11-NEXT:    buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc
1687; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1688; GFX11-NEXT:    s_waitcnt vmcnt(0)
1689; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1690; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1691; GFX11-NEXT:    s_endpgm
1692entry:
  ; Per-lane varying value: the workitem id in the x dimension.
1693  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Data operand is the constant 1; %lane is passed as the third operand and
  ; the last two operands are 0. %inout is the <4 x i32> second operand.
1694  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
  ; Store the atomic's result through %out so the call is not dead.
1695  store i32 %old, i32 addrspace(1)* %out
1696  ret void
1697}
1698