1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W64 %s
8; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11W32 %s
9
; Declarations of the AMDGPU intrinsics referenced by this test file.
; NOTE(review): add_i32_constant and add_i32_uniform call only
; llvm.amdgcn.raw.buffer.atomic.add; the other declarations are presumably
; needed by other functions in this file - confirm before removing any.
10declare i32 @llvm.amdgcn.workitem.id.x()
11declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
12declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32 immarg)
13declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 immarg)
14
15; Show what the atomic optimization pass will do for raw buffers.
16
; Kernel add_i32_constant - constant addend case.
;
; The IR calls llvm.amdgcn.raw.buffer.atomic.add with the constant 5 on the
; buffer resource %inout (the remaining three operands are 0) and stores the
; returned value to %out.
;
; Shape of the checked output, common to every target configuration listed:
;   * the exec mask is copied to SGPRs and v_mbcnt_lo (plus v_mbcnt_hi in the
;     64-bit exec variants) produces v0 from that copy;
;   * a compare of v0 against 0 (v_cmp_eq + s_and_saveexec, or v_cmpx_eq on
;     gfx1100) guards the only buffer_atomic_add, whose data operand is
;     s_bcnt1 of the exec copy multiplied by 5;
;   * after exec is restored with s_or, v_readfirstlane reads the atomic
;     result and v_mad_u32_u24 computes v0 * 5 + that value, which is stored.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing by hand.
17define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
18; GFX6-LABEL: add_i32_constant:
19; GFX6:       ; %bb.0: ; %entry
20; GFX6-NEXT:    s_mov_b64 s[2:3], exec
21; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
22; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
23; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
24; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
25; GFX6-NEXT:    ; implicit-def: $vgpr1
26; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
27; GFX6-NEXT:    s_cbranch_execz .LBB0_2
28; GFX6-NEXT:  ; %bb.1:
29; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
30; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
31; GFX6-NEXT:    s_mul_i32 s0, s0, 5
32; GFX6-NEXT:    v_mov_b32_e32 v1, s0
33; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
34; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
35; GFX6-NEXT:  .LBB0_2:
36; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
37; GFX6-NEXT:    s_waitcnt vmcnt(0)
38; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
39; GFX6-NEXT:    s_mov_b32 s7, 0xf000
40; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
41; GFX6-NEXT:    s_mov_b32 s6, -1
42; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
43; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
44; GFX6-NEXT:    s_endpgm
45;
46; GFX8-LABEL: add_i32_constant:
47; GFX8:       ; %bb.0: ; %entry
48; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
49; GFX8-NEXT:    s_mov_b64 s[6:7], exec
50; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
51; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
52; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
53; GFX8-NEXT:    ; implicit-def: $vgpr1
54; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
55; GFX8-NEXT:    s_cbranch_execz .LBB0_2
56; GFX8-NEXT:  ; %bb.1:
57; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
58; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
59; GFX8-NEXT:    s_mul_i32 s0, s0, 5
60; GFX8-NEXT:    v_mov_b32_e32 v1, s0
61; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
62; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
63; GFX8-NEXT:  .LBB0_2:
64; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
65; GFX8-NEXT:    s_waitcnt vmcnt(0)
66; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
67; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
68; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX8-NEXT:    v_mov_b32_e32 v0, s2
70; GFX8-NEXT:    v_mov_b32_e32 v1, s3
71; GFX8-NEXT:    flat_store_dword v[0:1], v2
72; GFX8-NEXT:    s_endpgm
73;
74; GFX9-LABEL: add_i32_constant:
75; GFX9:       ; %bb.0: ; %entry
76; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
77; GFX9-NEXT:    s_mov_b64 s[6:7], exec
78; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
79; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
80; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
81; GFX9-NEXT:    ; implicit-def: $vgpr1
82; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
83; GFX9-NEXT:    s_cbranch_execz .LBB0_2
84; GFX9-NEXT:  ; %bb.1:
85; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
86; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
87; GFX9-NEXT:    s_mul_i32 s0, s0, 5
88; GFX9-NEXT:    v_mov_b32_e32 v1, s0
89; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
90; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
91; GFX9-NEXT:  .LBB0_2:
92; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
93; GFX9-NEXT:    s_waitcnt vmcnt(0)
94; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
95; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
96; GFX9-NEXT:    v_mov_b32_e32 v1, 0
97; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
98; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
99; GFX9-NEXT:    s_endpgm
100;
101; GFX10W64-LABEL: add_i32_constant:
102; GFX10W64:       ; %bb.0: ; %entry
103; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
104; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
105; GFX10W64-NEXT:    ; implicit-def: $vgpr1
106; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
107; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
108; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
109; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
110; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
111; GFX10W64-NEXT:  ; %bb.1:
112; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
113; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
114; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
115; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
116; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
117; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
118; GFX10W64-NEXT:  .LBB0_2:
119; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
120; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
121; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
122; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
123; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
124; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
125; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
126; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
127; GFX10W64-NEXT:    s_endpgm
128;
129; GFX10W32-LABEL: add_i32_constant:
130; GFX10W32:       ; %bb.0: ; %entry
131; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
132; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
133; GFX10W32-NEXT:    ; implicit-def: $vgpr1
134; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
135; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
136; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
137; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
138; GFX10W32-NEXT:  ; %bb.1:
139; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
140; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
141; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
142; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
143; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
144; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
145; GFX10W32-NEXT:  .LBB0_2:
146; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
147; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
148; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
149; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
150; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
151; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
152; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
153; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
154; GFX10W32-NEXT:    s_endpgm
155;
156; GFX11W64-LABEL: add_i32_constant:
157; GFX11W64:       ; %bb.0: ; %entry
158; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
159; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
160; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
161; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
162; GFX11W64-NEXT:    ; implicit-def: $vgpr1
163; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
164; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
165; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
166; GFX11W64-NEXT:    s_cbranch_execz .LBB0_2
167; GFX11W64-NEXT:  ; %bb.1:
168; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
169; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
170; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
171; GFX11W64-NEXT:    s_mul_i32 s0, s0, 5
172; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
173; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
174; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
175; GFX11W64-NEXT:  .LBB0_2:
176; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
177; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
178; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
179; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
180; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
181; GFX11W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
182; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
184; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
185; GFX11W64-NEXT:    s_endpgm
186;
187; GFX11W32-LABEL: add_i32_constant:
188; GFX11W32:       ; %bb.0: ; %entry
189; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
190; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
191; GFX11W32-NEXT:    s_mov_b32 s4, exec_lo
192; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
193; GFX11W32-NEXT:    ; implicit-def: $vgpr1
194; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
195; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
196; GFX11W32-NEXT:    s_cbranch_execz .LBB0_2
197; GFX11W32-NEXT:  ; %bb.1:
198; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
199; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s5
200; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
201; GFX11W32-NEXT:    s_mul_i32 s0, s0, 5
202; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
203; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
205; GFX11W32-NEXT:  .LBB0_2:
206; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
207; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
208; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
209; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
210; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
211; GFX11W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
212; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
214; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
215; GFX11W32-NEXT:    s_endpgm
216entry:
  ; Add the constant 5 to the buffer and keep the value the intrinsic returns.
217  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; Store the returned value so that it is not dead.
218  store i32 %old, i32 addrspace(1)* %out
219  ret void
220}
221
; Kernel add_i32_uniform - addend supplied as a kernel argument.
;
; The IR calls llvm.amdgcn.raw.buffer.atomic.add with the kernel argument
; %additive on the buffer resource %inout (the remaining three operands are 0)
; and stores the returned value to %out.
;
; Shape of the checked output, common to every target configuration listed:
;   * %additive is fetched with a scalar load (s_load_dword / s_load_b32);
;   * the exec mask is copied to SGPRs and v_mbcnt_lo (plus v_mbcnt_hi in the
;     64-bit exec variants) produces v0 from that copy;
;   * a compare of v0 against 0 guards the only buffer_atomic_add, whose data
;     operand is s_mul_i32 of the loaded argument and s_bcnt1 of the exec copy;
;   * after exec is restored with s_or, v_readfirstlane reads the atomic
;     result and it is combined with argument * v0, either as v_mul_lo_u32
;     followed by an add (gfx6, tonga, gfx900 checks) or as a single
;     v_mad_u64_u32 (gfx1010 and gfx1100 checks), before the store.
;
; The assertions are autogenerated (see the NOTE on the first line of the
; file); regenerate them with the script rather than editing by hand.
222define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
223; GFX6-LABEL: add_i32_uniform:
224; GFX6:       ; %bb.0: ; %entry
225; GFX6-NEXT:    s_mov_b64 s[2:3], exec
226; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
227; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
228; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
229; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
230; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
231; GFX6-NEXT:    ; implicit-def: $vgpr1
232; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
233; GFX6-NEXT:    s_cbranch_execz .LBB1_2
234; GFX6-NEXT:  ; %bb.1:
235; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
236; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
237; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
238; GFX6-NEXT:    s_mul_i32 s0, s8, s0
239; GFX6-NEXT:    v_mov_b32_e32 v1, s0
240; GFX6-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
241; GFX6-NEXT:  .LBB1_2:
242; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
243; GFX6-NEXT:    s_waitcnt vmcnt(0)
244; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
245; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
246; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
247; GFX6-NEXT:    s_mov_b32 s7, 0xf000
248; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
249; GFX6-NEXT:    s_mov_b32 s6, -1
250; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
251; GFX6-NEXT:    s_endpgm
252;
253; GFX8-LABEL: add_i32_uniform:
254; GFX8:       ; %bb.0: ; %entry
255; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
256; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
257; GFX8-NEXT:    s_mov_b64 s[4:5], exec
258; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
259; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
260; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
261; GFX8-NEXT:    ; implicit-def: $vgpr1
262; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
263; GFX8-NEXT:    s_cbranch_execz .LBB1_2
264; GFX8-NEXT:  ; %bb.1:
265; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
266; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
267; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX8-NEXT:    s_mul_i32 s0, s8, s0
269; GFX8-NEXT:    v_mov_b32_e32 v1, s0
270; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
271; GFX8-NEXT:  .LBB1_2:
272; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
273; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
274; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
275; GFX8-NEXT:    s_waitcnt vmcnt(0)
276; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
277; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
278; GFX8-NEXT:    v_mov_b32_e32 v0, s2
279; GFX8-NEXT:    v_mov_b32_e32 v1, s3
280; GFX8-NEXT:    flat_store_dword v[0:1], v2
281; GFX8-NEXT:    s_endpgm
282;
283; GFX9-LABEL: add_i32_uniform:
284; GFX9:       ; %bb.0: ; %entry
285; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
286; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
287; GFX9-NEXT:    s_mov_b64 s[4:5], exec
288; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
289; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
290; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
291; GFX9-NEXT:    ; implicit-def: $vgpr1
292; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
293; GFX9-NEXT:    s_cbranch_execz .LBB1_2
294; GFX9-NEXT:  ; %bb.1:
295; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
296; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
297; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
298; GFX9-NEXT:    s_mul_i32 s0, s8, s0
299; GFX9-NEXT:    v_mov_b32_e32 v1, s0
300; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
301; GFX9-NEXT:  .LBB1_2:
302; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
303; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
304; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
305; GFX9-NEXT:    s_waitcnt vmcnt(0)
306; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
307; GFX9-NEXT:    v_mov_b32_e32 v1, 0
308; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
309; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
310; GFX9-NEXT:    s_endpgm
311;
312; GFX10W64-LABEL: add_i32_uniform:
313; GFX10W64:       ; %bb.0: ; %entry
314; GFX10W64-NEXT:    s_clause 0x1
315; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
316; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
317; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
318; GFX10W64-NEXT:    ; implicit-def: $vgpr1
319; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
320; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
321; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
322; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
323; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
324; GFX10W64-NEXT:  ; %bb.1:
325; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
326; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
327; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
328; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
329; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
330; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
331; GFX10W64-NEXT:  .LBB1_2:
332; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
333; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
334; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
335; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
336; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
337; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s8, v0, s[0:1]
338; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
339; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
340; GFX10W64-NEXT:    s_endpgm
341;
342; GFX10W32-LABEL: add_i32_uniform:
343; GFX10W32:       ; %bb.0: ; %entry
344; GFX10W32-NEXT:    s_clause 0x1
345; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
346; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
347; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
348; GFX10W32-NEXT:    ; implicit-def: $vgpr1
349; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
350; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
351; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
352; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
353; GFX10W32-NEXT:  ; %bb.1:
354; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
355; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
356; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
358; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
359; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
360; GFX10W32-NEXT:  .LBB1_2:
361; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
362; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
363; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
364; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
365; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
366; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], s0, s4, v0, s[0:1]
367; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
368; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
369; GFX10W32-NEXT:    s_endpgm
370;
371; GFX11W64-LABEL: add_i32_uniform:
372; GFX11W64:       ; %bb.0: ; %entry
373; GFX11W64-NEXT:    s_clause 0x1
374; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
375; GFX11W64-NEXT:    s_load_b32 s8, s[0:1], 0x44
376; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
377; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
378; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
379; GFX11W64-NEXT:    ; implicit-def: $vgpr1
380; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
381; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
382; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
383; GFX11W64-NEXT:    s_cbranch_execz .LBB1_2
384; GFX11W64-NEXT:  ; %bb.1:
385; GFX11W64-NEXT:    s_load_b128 s[12:15], s[0:1], 0x34
386; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
387; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
388; GFX11W64-NEXT:    s_mul_i32 s0, s8, s0
389; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
390; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
391; GFX11W64-NEXT:    buffer_atomic_add_u32 v1, off, s[12:15], 0 glc
392; GFX11W64-NEXT:  .LBB1_2:
393; GFX11W64-NEXT:    s_or_b64 exec, exec, s[6:7]
394; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
395; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
396; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
398; GFX11W64-NEXT:    v_mad_u64_u32 v[1:2], null, s8, v0, s[0:1]
399; GFX11W64-NEXT:    v_mov_b32_e32 v0, 0
400; GFX11W64-NEXT:    global_store_b32 v0, v1, s[2:3]
401; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
402; GFX11W64-NEXT:    s_endpgm
403;
404; GFX11W32-LABEL: add_i32_uniform:
405; GFX11W32:       ; %bb.0: ; %entry
406; GFX11W32-NEXT:    s_clause 0x1
407; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
408; GFX11W32-NEXT:    s_load_b32 s4, s[0:1], 0x44
409; GFX11W32-NEXT:    s_mov_b32 s6, exec_lo
410; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
411; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
412; GFX11W32-NEXT:    ; implicit-def: $vgpr1
413; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
414; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
415; GFX11W32-NEXT:    s_cbranch_execz .LBB1_2
416; GFX11W32-NEXT:  ; %bb.1:
417; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
418; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s6
419; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX11W32-NEXT:    s_mul_i32 s0, s4, s0
421; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
422; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
423; GFX11W32-NEXT:    buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
424; GFX11W32-NEXT:  .LBB1_2:
425; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
426; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
427; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
428; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
429; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
430; GFX11W32-NEXT:    v_mad_u64_u32 v[1:2], null, s4, v0, s[0:1]
431; GFX11W32-NEXT:    v_mov_b32_e32 v0, 0
432; GFX11W32-NEXT:    global_store_b32 v0, v1, s[2:3]
433; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
434; GFX11W32-NEXT:    s_endpgm
435entry:
  ; Add the kernel argument %additive to the buffer and keep the returned value.
436  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; Store the returned value so that it is not dead.
437  store i32 %old, i32 addrspace(1)* %out
438  ret void
439}
440
441define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
442; GFX6-LABEL: add_i32_varying_vdata:
443; GFX6:       ; %bb.0: ; %entry
444; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
445; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
446; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX6-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
448; GFX6-NEXT:    s_mov_b32 s3, 0xf000
449; GFX6-NEXT:    s_mov_b32 s2, -1
450; GFX6-NEXT:    s_waitcnt vmcnt(0)
451; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
452; GFX6-NEXT:    s_endpgm
453;
454; GFX8-LABEL: add_i32_varying_vdata:
455; GFX8:       ; %bb.0: ; %entry
456; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
457; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
458; GFX8-NEXT:    v_mov_b32_e32 v1, 0
459; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
460; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
461; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
462; GFX8-NEXT:    v_mov_b32_e32 v2, v0
463; GFX8-NEXT:    s_not_b64 exec, exec
464; GFX8-NEXT:    v_mov_b32_e32 v2, 0
465; GFX8-NEXT:    s_not_b64 exec, exec
466; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
467; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
468; GFX8-NEXT:    s_nop 1
469; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
470; GFX8-NEXT:    s_nop 1
471; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
472; GFX8-NEXT:    s_nop 1
473; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
474; GFX8-NEXT:    s_nop 1
475; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
476; GFX8-NEXT:    s_nop 1
477; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
478; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
479; GFX8-NEXT:    s_nop 0
480; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
481; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
482; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
483; GFX8-NEXT:    ; implicit-def: $vgpr0
484; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
485; GFX8-NEXT:    s_cbranch_execz .LBB2_2
486; GFX8-NEXT:  ; %bb.1:
487; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
488; GFX8-NEXT:    v_mov_b32_e32 v0, s6
489; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
491; GFX8-NEXT:  .LBB2_2:
492; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
493; GFX8-NEXT:    s_waitcnt vmcnt(0)
494; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
495; GFX8-NEXT:    v_mov_b32_e32 v0, v1
496; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
497; GFX8-NEXT:    v_mov_b32_e32 v4, s3
498; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
499; GFX8-NEXT:    v_mov_b32_e32 v3, s2
500; GFX8-NEXT:    flat_store_dword v[3:4], v0
501; GFX8-NEXT:    s_endpgm
502;
503; GFX9-LABEL: add_i32_varying_vdata:
504; GFX9:       ; %bb.0: ; %entry
505; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
506; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
507; GFX9-NEXT:    v_mov_b32_e32 v1, 0
508; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
509; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
510; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
511; GFX9-NEXT:    v_mov_b32_e32 v2, v0
512; GFX9-NEXT:    s_not_b64 exec, exec
513; GFX9-NEXT:    v_mov_b32_e32 v2, 0
514; GFX9-NEXT:    s_not_b64 exec, exec
515; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
516; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
517; GFX9-NEXT:    s_nop 1
518; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
519; GFX9-NEXT:    s_nop 1
520; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
521; GFX9-NEXT:    s_nop 1
522; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
523; GFX9-NEXT:    s_nop 1
524; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
525; GFX9-NEXT:    s_nop 1
526; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
527; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
528; GFX9-NEXT:    s_nop 0
529; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
530; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
531; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
532; GFX9-NEXT:    ; implicit-def: $vgpr0
533; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
534; GFX9-NEXT:    s_cbranch_execz .LBB2_2
535; GFX9-NEXT:  ; %bb.1:
536; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
537; GFX9-NEXT:    v_mov_b32_e32 v0, s6
538; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
539; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
540; GFX9-NEXT:  .LBB2_2:
541; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
542; GFX9-NEXT:    s_waitcnt vmcnt(0)
543; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
544; GFX9-NEXT:    v_mov_b32_e32 v0, v1
545; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
546; GFX9-NEXT:    v_mov_b32_e32 v3, 0
547; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
548; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
549; GFX9-NEXT:    s_endpgm
550;
551; GFX10W64-LABEL: add_i32_varying_vdata:
552; GFX10W64:       ; %bb.0: ; %entry
553; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
554; GFX10W64-NEXT:    s_not_b64 exec, exec
555; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
556; GFX10W64-NEXT:    s_not_b64 exec, exec
557; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
558; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
559; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
560; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
561; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
562; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
563; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
564; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
565; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
566; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
567; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
568; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
569; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
570; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
571; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
572; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
573; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
574; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
575; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
576; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
577; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
578; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
579; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
580; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
581; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
582; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
583; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
584; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
585; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
586; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
587; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
588; GFX10W64-NEXT:    ; implicit-def: $vgpr0
589; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
590; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
591; GFX10W64-NEXT:  ; %bb.1:
592; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
593; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
594; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
595; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
596; GFX10W64-NEXT:  .LBB2_2:
597; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
598; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
599; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
600; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
601; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
602; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
603; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
604; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
605; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
606; GFX10W64-NEXT:    s_endpgm
607;
608; GFX10W32-LABEL: add_i32_varying_vdata:
609; GFX10W32:       ; %bb.0: ; %entry
610; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
611; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
612; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
613; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
614; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
615; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
616; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
617; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
618; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
619; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
620; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
621; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
622; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
623; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
624; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
625; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
626; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
627; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
628; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
629; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
630; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
631; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
632; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
633; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
634; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
635; GFX10W32-NEXT:    ; implicit-def: $vgpr0
636; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
637; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
638; GFX10W32-NEXT:  ; %bb.1:
639; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
640; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
641; GFX10W32-NEXT:    s_mov_b32 s5, s6
642; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
644; GFX10W32-NEXT:  .LBB2_2:
645; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
646; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
647; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
648; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
649; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
650; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
651; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
652; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
653; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
654; GFX10W32-NEXT:    s_endpgm
655;
656; GFX11W64-LABEL: add_i32_varying_vdata:
657; GFX11W64:       ; %bb.0: ; %entry
658; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
659; GFX11W64-NEXT:    s_not_b64 exec, exec
660; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
661; GFX11W64-NEXT:    s_not_b64 exec, exec
662; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
663; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
664; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
665; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
666; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
667; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
668; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
669; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
670; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
671; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
672; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
673; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
674; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
675; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
676; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
677; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
678; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
679; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
680; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 15
681; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
682; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
683; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
684; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
685; GFX11W64-NEXT:    v_readlane_b32 s7, v1, 31
686; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 16
687; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
688; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
689; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
690; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
691; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 63
692; GFX11W64-NEXT:    v_readlane_b32 s8, v1, 47
693; GFX11W64-NEXT:    v_writelane_b32 v3, s7, 32
694; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
695; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
696; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
697; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
698; GFX11W64-NEXT:    v_writelane_b32 v3, s8, 48
699; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
700; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
701; GFX11W64-NEXT:    ; implicit-def: $vgpr0
702; GFX11W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
703; GFX11W64-NEXT:    s_cbranch_execz .LBB2_2
704; GFX11W64-NEXT:  ; %bb.1:
705; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
706; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
707; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
708; GFX11W64-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
709; GFX11W64-NEXT:  .LBB2_2:
710; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
711; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
712; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v0
713; GFX11W64-NEXT:    v_mov_b32_e32 v0, v3
714; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
715; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
716; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
717; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
718; GFX11W64-NEXT:    global_store_b32 v4, v0, s[2:3]
719; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
720; GFX11W64-NEXT:    s_endpgm
721;
722; GFX11W32-LABEL: add_i32_varying_vdata:
723; GFX11W32:       ; %bb.0: ; %entry
724; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
725; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
726; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
727; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
728; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
729; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
730; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
731; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
732; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
733; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
734; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
735; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
736; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
737; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
738; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
739; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
740; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
741; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
742; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
743; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
744; GFX11W32-NEXT:    v_readlane_b32 s6, v1, 31
745; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
746; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
747; GFX11W32-NEXT:    v_readlane_b32 s5, v1, 15
748; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
749; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
750; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
751; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
752; GFX11W32-NEXT:    v_writelane_b32 v3, s5, 16
753; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
754; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
755; GFX11W32-NEXT:    ; implicit-def: $vgpr0
756; GFX11W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
757; GFX11W32-NEXT:    s_cbranch_execz .LBB2_2
758; GFX11W32-NEXT:  ; %bb.1:
759; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
760; GFX11W32-NEXT:    v_mov_b32_e32 v0, s6
761; GFX11W32-NEXT:    s_mov_b32 s5, s6
762; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
763; GFX11W32-NEXT:    buffer_atomic_add_u32 v0, off, s[8:11], 0 glc
764; GFX11W32-NEXT:  .LBB2_2:
765; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
766; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
767; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
768; GFX11W32-NEXT:    v_mov_b32_e32 v0, v3
769; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
770; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
771; GFX11W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
772; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
773; GFX11W32-NEXT:    global_store_b32 v4, v0, s[2:3]
774; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
775; GFX11W32-NEXT:    s_endpgm
776entry:
777  %lane = call i32 @llvm.amdgcn.workitem.id.x()
778  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
779  store i32 %old, i32 addrspace(1)* %out
780  ret void
781}
782
; Struct-buffer variant of the varying-value add: every lane passes its own
; workitem id as the value to add, while the buffer index (%vindex) is a kernel
; argument. Per the autogenerated checks below, the GFX6 run emits the atomic
; directly (one buffer_atomic_add with idxen, no DPP adds and no exec-masked
; branch). The GFX8 and newer runs instead combine the per-lane values with
; v_add*_dpp steps, branch so that only lanes whose mbcnt result is 0 issue a
; single buffer_atomic_add with idxen, and afterwards add the v_readfirstlane
; of the returned value to a per-lane VGPR before storing to %out.
783define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %vindex) {
784; GFX6-LABEL: struct_add_i32_varying_vdata:
785; GFX6:       ; %bb.0: ; %entry
786; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x11
787; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
788; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
789; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX6-NEXT:    v_mov_b32_e32 v1, s2
791; GFX6-NEXT:    buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
792; GFX6-NEXT:    s_mov_b32 s3, 0xf000
793; GFX6-NEXT:    s_mov_b32 s2, -1
794; GFX6-NEXT:    s_waitcnt vmcnt(0)
795; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
796; GFX6-NEXT:    s_endpgm
797;
798; GFX8-LABEL: struct_add_i32_varying_vdata:
799; GFX8:       ; %bb.0: ; %entry
800; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
801; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
802; GFX8-NEXT:    v_mov_b32_e32 v1, 0
803; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
804; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
805; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
806; GFX8-NEXT:    v_mov_b32_e32 v2, v0
807; GFX8-NEXT:    s_not_b64 exec, exec
808; GFX8-NEXT:    v_mov_b32_e32 v2, 0
809; GFX8-NEXT:    s_not_b64 exec, exec
810; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
811; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
812; GFX8-NEXT:    s_nop 1
813; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
814; GFX8-NEXT:    s_nop 1
815; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
816; GFX8-NEXT:    s_nop 1
817; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
818; GFX8-NEXT:    s_nop 1
819; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
820; GFX8-NEXT:    s_nop 1
821; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
822; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
823; GFX8-NEXT:    s_nop 0
824; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
825; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
826; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
827; GFX8-NEXT:    ; implicit-def: $vgpr0
828; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
829; GFX8-NEXT:    s_cbranch_execz .LBB3_2
830; GFX8-NEXT:  ; %bb.1:
831; GFX8-NEXT:    s_load_dword s7, s[0:1], 0x44
832; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
833; GFX8-NEXT:    v_mov_b32_e32 v0, s6
834; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
835; GFX8-NEXT:    v_mov_b32_e32 v3, s7
836; GFX8-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
837; GFX8-NEXT:  .LBB3_2:
838; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
839; GFX8-NEXT:    s_waitcnt vmcnt(0)
840; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
841; GFX8-NEXT:    v_mov_b32_e32 v0, v1
842; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX8-NEXT:    v_mov_b32_e32 v4, s3
844; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
845; GFX8-NEXT:    v_mov_b32_e32 v3, s2
846; GFX8-NEXT:    flat_store_dword v[3:4], v0
847; GFX8-NEXT:    s_endpgm
848;
849; GFX9-LABEL: struct_add_i32_varying_vdata:
850; GFX9:       ; %bb.0: ; %entry
851; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
852; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
853; GFX9-NEXT:    v_mov_b32_e32 v1, 0
854; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
855; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
856; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
857; GFX9-NEXT:    v_mov_b32_e32 v2, v0
858; GFX9-NEXT:    s_not_b64 exec, exec
859; GFX9-NEXT:    v_mov_b32_e32 v2, 0
860; GFX9-NEXT:    s_not_b64 exec, exec
861; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
862; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
863; GFX9-NEXT:    s_nop 1
864; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
865; GFX9-NEXT:    s_nop 1
866; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
867; GFX9-NEXT:    s_nop 1
868; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
869; GFX9-NEXT:    s_nop 1
870; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
871; GFX9-NEXT:    s_nop 1
872; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
873; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
874; GFX9-NEXT:    s_nop 0
875; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
876; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
877; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
878; GFX9-NEXT:    ; implicit-def: $vgpr0
879; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
880; GFX9-NEXT:    s_cbranch_execz .LBB3_2
881; GFX9-NEXT:  ; %bb.1:
882; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x44
883; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
884; GFX9-NEXT:    v_mov_b32_e32 v0, s6
885; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
886; GFX9-NEXT:    v_mov_b32_e32 v3, s7
887; GFX9-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
888; GFX9-NEXT:  .LBB3_2:
889; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
890; GFX9-NEXT:    s_waitcnt vmcnt(0)
891; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
892; GFX9-NEXT:    v_mov_b32_e32 v0, v1
893; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
894; GFX9-NEXT:    v_mov_b32_e32 v3, 0
895; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
896; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
897; GFX9-NEXT:    s_endpgm
898;
899; GFX10W64-LABEL: struct_add_i32_varying_vdata:
900; GFX10W64:       ; %bb.0: ; %entry
901; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
902; GFX10W64-NEXT:    s_not_b64 exec, exec
903; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
904; GFX10W64-NEXT:    s_not_b64 exec, exec
905; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
906; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
907; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
908; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
909; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
910; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
911; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
912; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
913; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
914; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
915; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
916; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
917; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
918; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
919; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
920; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
921; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
922; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
923; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
924; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
925; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
926; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
927; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
928; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
929; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
930; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
931; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
932; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
933; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
934; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
935; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
936; GFX10W64-NEXT:    ; implicit-def: $vgpr0
937; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
938; GFX10W64-NEXT:    s_cbranch_execz .LBB3_2
939; GFX10W64-NEXT:  ; %bb.1:
940; GFX10W64-NEXT:    s_clause 0x1
941; GFX10W64-NEXT:    s_load_dword s7, s[0:1], 0x44
942; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
943; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
944; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
945; GFX10W64-NEXT:    v_mov_b32_e32 v4, s7
946; GFX10W64-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
947; GFX10W64-NEXT:  .LBB3_2:
948; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
949; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
950; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
951; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
952; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
953; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
954; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
955; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
956; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
957; GFX10W64-NEXT:    s_endpgm
958;
959; GFX10W32-LABEL: struct_add_i32_varying_vdata:
960; GFX10W32:       ; %bb.0: ; %entry
961; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
962; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
963; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
964; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
965; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
966; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
967; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
968; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
969; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
970; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
971; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
972; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
973; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
974; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
975; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
976; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
977; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
978; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
979; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
980; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
981; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
982; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
983; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
984; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
985; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
986; GFX10W32-NEXT:    ; implicit-def: $vgpr0
987; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
988; GFX10W32-NEXT:    s_cbranch_execz .LBB3_2
989; GFX10W32-NEXT:  ; %bb.1:
990; GFX10W32-NEXT:    s_mov_b32 s5, s6
991; GFX10W32-NEXT:    s_clause 0x1
992; GFX10W32-NEXT:    s_load_dword s6, s[0:1], 0x44
993; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
994; GFX10W32-NEXT:    v_mov_b32_e32 v0, s5
995; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
996; GFX10W32-NEXT:    v_mov_b32_e32 v4, s6
997; GFX10W32-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
998; GFX10W32-NEXT:  .LBB3_2:
999; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1000; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1001; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1002; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1003; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1004; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1005; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1006; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1007; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1008; GFX10W32-NEXT:    s_endpgm
1009;
1010; GFX11W64-LABEL: struct_add_i32_varying_vdata:
1011; GFX11W64:       ; %bb.0: ; %entry
1012; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
1013; GFX11W64-NEXT:    s_not_b64 exec, exec
1014; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1015; GFX11W64-NEXT:    s_not_b64 exec, exec
1016; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1017; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1018; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1019; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
1020; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1021; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1022; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1023; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1024; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1025; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
1026; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1027; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1028; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1029; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
1030; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1031; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
1032; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1033; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1034; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 15
1035; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1036; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
1037; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1038; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1039; GFX11W64-NEXT:    v_readlane_b32 s7, v1, 31
1040; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 16
1041; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1042; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1043; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1044; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1045; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 63
1046; GFX11W64-NEXT:    v_readlane_b32 s8, v1, 47
1047; GFX11W64-NEXT:    v_writelane_b32 v3, s7, 32
1048; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1049; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1050; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1051; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1052; GFX11W64-NEXT:    v_writelane_b32 v3, s8, 48
1053; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1054; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1055; GFX11W64-NEXT:    ; implicit-def: $vgpr0
1056; GFX11W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1057; GFX11W64-NEXT:    s_cbranch_execz .LBB3_2
1058; GFX11W64-NEXT:  ; %bb.1:
1059; GFX11W64-NEXT:    s_clause 0x1
1060; GFX11W64-NEXT:    s_load_b32 s7, s[0:1], 0x44
1061; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1062; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
1063; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1064; GFX11W64-NEXT:    v_mov_b32_e32 v4, s7
1065; GFX11W64-NEXT:    buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
1066; GFX11W64-NEXT:  .LBB3_2:
1067; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1068; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1069; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v0
1070; GFX11W64-NEXT:    v_mov_b32_e32 v0, v3
1071; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
1072; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1073; GFX11W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1074; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1075; GFX11W64-NEXT:    global_store_b32 v4, v0, s[2:3]
1076; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1077; GFX11W64-NEXT:    s_endpgm
1078;
1079; GFX11W32-LABEL: struct_add_i32_varying_vdata:
1080; GFX11W32:       ; %bb.0: ; %entry
1081; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
1082; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
1083; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1084; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
1085; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
1086; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1087; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1088; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1089; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1090; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1091; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1092; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1093; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
1094; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1095; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
1096; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1097; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
1098; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1099; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1100; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
1101; GFX11W32-NEXT:    v_readlane_b32 s6, v1, 31
1102; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
1103; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1104; GFX11W32-NEXT:    v_readlane_b32 s5, v1, 15
1105; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
1106; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1107; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
1108; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1109; GFX11W32-NEXT:    v_writelane_b32 v3, s5, 16
1110; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
1111; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1112; GFX11W32-NEXT:    ; implicit-def: $vgpr0
1113; GFX11W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1114; GFX11W32-NEXT:    s_cbranch_execz .LBB3_2
1115; GFX11W32-NEXT:  ; %bb.1:
1116; GFX11W32-NEXT:    s_mov_b32 s5, s6
1117; GFX11W32-NEXT:    s_clause 0x1
1118; GFX11W32-NEXT:    s_load_b32 s6, s[0:1], 0x44
1119; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1120; GFX11W32-NEXT:    v_mov_b32_e32 v0, s5
1121; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1122; GFX11W32-NEXT:    v_mov_b32_e32 v4, s6
1123; GFX11W32-NEXT:    buffer_atomic_add_u32 v0, v4, s[8:11], 0 idxen glc
1124; GFX11W32-NEXT:  .LBB3_2:
1125; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1126; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1127; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
1128; GFX11W32-NEXT:    v_mov_b32_e32 v0, v3
1129; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
1130; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1131; GFX11W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
1132; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1133; GFX11W32-NEXT:    global_store_b32 v4, v0, s[2:3]
1134; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1135; GFX11W32-NEXT:    s_endpgm
1136entry:
  ; Per-lane value to add: this lane's workitem id in the x dimension.
1137  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Operands in order: value to add (%lane), buffer resource (%inout), buffer
  ; index (%vindex), then three zero operands, the last of which is declared
  ; immarg at the top of the file.
1138  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 %vindex, i32 0, i32 0, i32 0)
  ; Store the value returned by the atomic so the result stays live.
1139  store i32 %old, i32 addrspace(1)* %out
1140  ret void
1141}
1142
; Raw-buffer add where the value is the constant 1 but the third operand (the
; one the checks show being passed as the offen VGPR) is the per-lane workitem
; id. In the checks for every run line the atomic is emitted directly as one
; buffer_atomic_add with offen on v0: there is no mbcnt / saveexec / branch
; sequence and no readfirstlane afterwards, and the returned VGPR is stored to
; %out unchanged.
1143define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1144; GFX6-LABEL: add_i32_varying_offset:
1145; GFX6:       ; %bb.0: ; %entry
1146; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1147; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1148; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1149; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1150; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
1151; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1152; GFX6-NEXT:    s_mov_b32 s2, -1
1153; GFX6-NEXT:    s_waitcnt vmcnt(0)
1154; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1155; GFX6-NEXT:    s_endpgm
1156;
1157; GFX8-LABEL: add_i32_varying_offset:
1158; GFX8:       ; %bb.0: ; %entry
1159; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1160; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1161; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1162; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1163; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 offen glc
1164; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1165; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1166; GFX8-NEXT:    s_waitcnt vmcnt(0)
1167; GFX8-NEXT:    flat_store_dword v[0:1], v2
1168; GFX8-NEXT:    s_endpgm
1169;
1170; GFX9-LABEL: add_i32_varying_offset:
1171; GFX9:       ; %bb.0: ; %entry
1172; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1173; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1174; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1175; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1176; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
1177; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1178; GFX9-NEXT:    s_waitcnt vmcnt(0)
1179; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1180; GFX9-NEXT:    s_endpgm
1181;
1182; GFX10-LABEL: add_i32_varying_offset:
1183; GFX10:       ; %bb.0: ; %entry
1184; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1185; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1186; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1187; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1188; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
1189; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1190; GFX10-NEXT:    s_waitcnt vmcnt(0)
1191; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1192; GFX10-NEXT:    s_endpgm
1193;
1194; GFX11-LABEL: add_i32_varying_offset:
1195; GFX11:       ; %bb.0: ; %entry
1196; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
1197; GFX11-NEXT:    v_mov_b32_e32 v1, 1
1198; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
1199; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1200; GFX11-NEXT:    buffer_atomic_add_u32 v1, v0, s[4:7], 0 offen glc
1201; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1202; GFX11-NEXT:    s_waitcnt vmcnt(0)
1203; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
1204; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1205; GFX11-NEXT:    s_endpgm
1206entry:
  ; The workitem id is used as the offset operand here, not as the value.
1207  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Operands in order: constant value 1, buffer resource (%inout), per-lane
  ; offset (%lane), then two zero operands, the last of which is declared
  ; immarg at the top of the file.
1208  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
  ; Store the value returned by the atomic so the result stays live.
1209  store i32 %old, i32 addrspace(1)* %out
1210  ret void
1211}
1212
1213define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
1214; GFX6-LABEL: sub_i32_constant:
1215; GFX6:       ; %bb.0: ; %entry
1216; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1217; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1218; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1219; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1220; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1221; GFX6-NEXT:    ; implicit-def: $vgpr1
1222; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1223; GFX6-NEXT:    s_cbranch_execz .LBB5_2
1224; GFX6-NEXT:  ; %bb.1:
1225; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
1226; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
1227; GFX6-NEXT:    s_mul_i32 s0, s0, 5
1228; GFX6-NEXT:    v_mov_b32_e32 v1, s0
1229; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1230; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1231; GFX6-NEXT:  .LBB5_2:
1232; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
1233; GFX6-NEXT:    s_waitcnt vmcnt(0)
1234; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
1235; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1236; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1237; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1238; GFX6-NEXT:    s_mov_b32 s6, -1
1239; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1240; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1241; GFX6-NEXT:    s_endpgm
1242;
1243; GFX8-LABEL: sub_i32_constant:
1244; GFX8:       ; %bb.0: ; %entry
1245; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1246; GFX8-NEXT:    s_mov_b64 s[6:7], exec
1247; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1248; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1249; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1250; GFX8-NEXT:    ; implicit-def: $vgpr1
1251; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1252; GFX8-NEXT:    s_cbranch_execz .LBB5_2
1253; GFX8-NEXT:  ; %bb.1:
1254; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1255; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1256; GFX8-NEXT:    s_mul_i32 s0, s0, 5
1257; GFX8-NEXT:    v_mov_b32_e32 v1, s0
1258; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1259; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1260; GFX8-NEXT:  .LBB5_2:
1261; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1262; GFX8-NEXT:    s_waitcnt vmcnt(0)
1263; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1264; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1265; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
1266; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1267; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1268; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1269; GFX8-NEXT:    flat_store_dword v[0:1], v2
1270; GFX8-NEXT:    s_endpgm
1271;
1272; GFX9-LABEL: sub_i32_constant:
1273; GFX9:       ; %bb.0: ; %entry
1274; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1275; GFX9-NEXT:    s_mov_b64 s[6:7], exec
1276; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1277; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1278; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1279; GFX9-NEXT:    ; implicit-def: $vgpr1
1280; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1281; GFX9-NEXT:    s_cbranch_execz .LBB5_2
1282; GFX9-NEXT:  ; %bb.1:
1283; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1284; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1285; GFX9-NEXT:    s_mul_i32 s0, s0, 5
1286; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1287; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1288; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1289; GFX9-NEXT:  .LBB5_2:
1290; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1291; GFX9-NEXT:    s_waitcnt vmcnt(0)
1292; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1293; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1294; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1295; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1296; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1297; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
1298; GFX9-NEXT:    s_endpgm
1299;
1300; GFX10W64-LABEL: sub_i32_constant:
1301; GFX10W64:       ; %bb.0: ; %entry
1302; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1303; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
1304; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1305; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1306; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1307; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1308; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1309; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
1310; GFX10W64-NEXT:  ; %bb.1:
1311; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1312; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1313; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
1314; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
1315; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1316; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1317; GFX10W64-NEXT:  .LBB5_2:
1318; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1319; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1320; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1321; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
1322; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1323; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1324; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1325; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1326; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
1327; GFX10W64-NEXT:    s_endpgm
1328;
1329; GFX10W32-LABEL: sub_i32_constant:
1330; GFX10W32:       ; %bb.0: ; %entry
1331; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1332; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
1333; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1334; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1335; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1336; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1337; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
1338; GFX10W32-NEXT:  ; %bb.1:
1339; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1340; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
1341; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
1342; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
1343; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1344; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1345; GFX10W32-NEXT:  .LBB5_2:
1346; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1347; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1348; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1349; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1350; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1351; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1352; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1353; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1354; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1355; GFX10W32-NEXT:    s_endpgm
1356;
1357; GFX11W64-LABEL: sub_i32_constant:
1358; GFX11W64:       ; %bb.0: ; %entry
1359; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1360; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
1361; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
1362; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1363; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1364; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1365; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
1366; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1367; GFX11W64-NEXT:    s_cbranch_execz .LBB5_2
1368; GFX11W64-NEXT:  ; %bb.1:
1369; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1370; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
1371; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1372; GFX11W64-NEXT:    s_mul_i32 s0, s0, 5
1373; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
1374; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1375; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1376; GFX11W64-NEXT:  .LBB5_2:
1377; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1378; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1379; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
1380; GFX11W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1381; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1382; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1383; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1384; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
1386; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1387; GFX11W64-NEXT:    s_endpgm
1388;
1389; GFX11W32-LABEL: sub_i32_constant:
1390; GFX11W32:       ; %bb.0: ; %entry
1391; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1392; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
1393; GFX11W32-NEXT:    s_mov_b32 s4, exec_lo
1394; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
1395; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1396; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1397; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1398; GFX11W32-NEXT:    s_cbranch_execz .LBB5_2
1399; GFX11W32-NEXT:  ; %bb.1:
1400; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1401; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s5
1402; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1403; GFX11W32-NEXT:    s_mul_i32 s0, s0, 5
1404; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
1405; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1406; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1407; GFX11W32-NEXT:  .LBB5_2:
1408; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1409; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1410; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
1411; GFX11W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
1412; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1413; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1414; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1415; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1416; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1417; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1418; GFX11W32-NEXT:    s_endpgm
1419entry:
1420  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
1421  store i32 %old, i32 addrspace(1)* %out
1422  ret void
1423}
1424
; sub_i32_uniform: raw-buffer atomic subtract whose data operand is the kernel
; argument %subitive; the value returned by the atomic is stored to %out.
;
; In every check block below the operand is loaded into an SGPR
; (s_load_dword / s_load_b32), and the expected code has this shape:
;   * v_mbcnt_* computes a per-lane count from the saved exec mask and only the
;     lane(s) where that count is 0 enter %bb.1;
;   * %bb.1 multiplies the operand by the bit count of the saved exec mask
;     (s_bcnt1_i32_* followed by s_mul_i32) and issues one buffer_atomic_sub
;     with glc set;
;   * after exec is restored, the atomic's result is read with
;     v_readfirstlane_b32 and operand * mbcnt (v_mul_lo_u32) is subtracted
;     from it to form the value that is stored.
;
; The check lines are autogenerated (see the NOTE at the top of the file), so
; they should be regenerated with the update script rather than edited by hand.
1425define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
1426; GFX6-LABEL: sub_i32_uniform:
1427; GFX6:       ; %bb.0: ; %entry
1428; GFX6-NEXT:    s_mov_b64 s[2:3], exec
1429; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
1430; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
1431; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
1432; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
1433; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1434; GFX6-NEXT:    ; implicit-def: $vgpr1
1435; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1436; GFX6-NEXT:    s_cbranch_execz .LBB6_2
1437; GFX6-NEXT:  ; %bb.1:
1438; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
1439; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
1440; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1441; GFX6-NEXT:    s_mul_i32 s0, s8, s0
1442; GFX6-NEXT:    v_mov_b32_e32 v1, s0
1443; GFX6-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1444; GFX6-NEXT:  .LBB6_2:
1445; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
1446; GFX6-NEXT:    s_waitcnt vmcnt(0)
1447; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
1448; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1449; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
1450; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1451; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1452; GFX6-NEXT:    s_mov_b32 s6, -1
1453; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1454; GFX6-NEXT:    s_endpgm
1455;
1456; GFX8-LABEL: sub_i32_uniform:
1457; GFX8:       ; %bb.0: ; %entry
1458; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1459; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
1460; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1461; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1462; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1463; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1464; GFX8-NEXT:    ; implicit-def: $vgpr1
1465; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1466; GFX8-NEXT:    s_cbranch_execz .LBB6_2
1467; GFX8-NEXT:  ; %bb.1:
1468; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1469; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1470; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1471; GFX8-NEXT:    s_mul_i32 s0, s8, s0
1472; GFX8-NEXT:    v_mov_b32_e32 v1, s0
1473; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1474; GFX8-NEXT:  .LBB6_2:
1475; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1476; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1477; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
1478; GFX8-NEXT:    s_waitcnt vmcnt(0)
1479; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1480; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
1481; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1482; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1483; GFX8-NEXT:    flat_store_dword v[0:1], v2
1484; GFX8-NEXT:    s_endpgm
1485;
1486; GFX9-LABEL: sub_i32_uniform:
1487; GFX9:       ; %bb.0: ; %entry
1488; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1489; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
1490; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1491; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1492; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1493; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1494; GFX9-NEXT:    ; implicit-def: $vgpr1
1495; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1496; GFX9-NEXT:    s_cbranch_execz .LBB6_2
1497; GFX9-NEXT:  ; %bb.1:
1498; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1499; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1500; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1501; GFX9-NEXT:    s_mul_i32 s0, s8, s0
1502; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1503; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1504; GFX9-NEXT:  .LBB6_2:
1505; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
1506; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1507; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
1508; GFX9-NEXT:    s_waitcnt vmcnt(0)
1509; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1510; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1511; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1512; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
1513; GFX9-NEXT:    s_endpgm
1514;
1515; GFX10W64-LABEL: sub_i32_uniform:
1516; GFX10W64:       ; %bb.0: ; %entry
1517; GFX10W64-NEXT:    s_clause 0x1
1518; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1519; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
1520; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
1521; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1522; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1523; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1524; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1525; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1526; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
1527; GFX10W64-NEXT:  ; %bb.1:
1528; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1529; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1530; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1531; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
1532; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
1533; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1534; GFX10W64-NEXT:  .LBB6_2:
1535; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1536; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
1537; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1538; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
1539; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1540; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
1541; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1542; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1543; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
1544; GFX10W64-NEXT:    s_endpgm
1545;
1546; GFX10W32-LABEL: sub_i32_uniform:
1547; GFX10W32:       ; %bb.0: ; %entry
1548; GFX10W32-NEXT:    s_clause 0x1
1549; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1550; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
1551; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
1552; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1553; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1554; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1555; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
1556; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
1557; GFX10W32-NEXT:  ; %bb.1:
1558; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1559; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
1560; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1561; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
1562; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
1563; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1564; GFX10W32-NEXT:  .LBB6_2:
1565; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1566; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
1567; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1568; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
1569; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1570; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1571; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1572; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1573; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1574; GFX10W32-NEXT:    s_endpgm
1575;
1576; GFX11W64-LABEL: sub_i32_uniform:
1577; GFX11W64:       ; %bb.0: ; %entry
1578; GFX11W64-NEXT:    s_clause 0x1
1579; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1580; GFX11W64-NEXT:    s_load_b32 s8, s[0:1], 0x44
1581; GFX11W64-NEXT:    s_mov_b64 s[4:5], exec
1582; GFX11W64-NEXT:    s_mov_b64 s[6:7], exec
1583; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1584; GFX11W64-NEXT:    ; implicit-def: $vgpr1
1585; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1586; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1587; GFX11W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
1588; GFX11W64-NEXT:    s_cbranch_execz .LBB6_2
1589; GFX11W64-NEXT:  ; %bb.1:
1590; GFX11W64-NEXT:    s_load_b128 s[12:15], s[0:1], 0x34
1591; GFX11W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1592; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1593; GFX11W64-NEXT:    s_mul_i32 s0, s8, s0
1594; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1595; GFX11W64-NEXT:    v_mov_b32_e32 v1, s0
1596; GFX11W64-NEXT:    buffer_atomic_sub_u32 v1, off, s[12:15], 0 glc
1597; GFX11W64-NEXT:  .LBB6_2:
1598; GFX11W64-NEXT:    s_or_b64 exec, exec, s[6:7]
1599; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1600; GFX11W64-NEXT:    v_mul_lo_u32 v0, s8, v0
1601; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1602; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v1
1603; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1604; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1605; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1606; GFX11W64-NEXT:    global_store_b32 v1, v0, s[2:3]
1607; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1608; GFX11W64-NEXT:    s_endpgm
1609;
1610; GFX11W32-LABEL: sub_i32_uniform:
1611; GFX11W32:       ; %bb.0: ; %entry
1612; GFX11W32-NEXT:    s_clause 0x1
1613; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1614; GFX11W32-NEXT:    s_load_b32 s4, s[0:1], 0x44
1615; GFX11W32-NEXT:    s_mov_b32 s6, exec_lo
1616; GFX11W32-NEXT:    s_mov_b32 s5, exec_lo
1617; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1618; GFX11W32-NEXT:    ; implicit-def: $vgpr1
1619; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1620; GFX11W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
1621; GFX11W32-NEXT:    s_cbranch_execz .LBB6_2
1622; GFX11W32-NEXT:  ; %bb.1:
1623; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1624; GFX11W32-NEXT:    s_bcnt1_i32_b32 s0, s6
1625; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1626; GFX11W32-NEXT:    s_mul_i32 s0, s4, s0
1627; GFX11W32-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1628; GFX11W32-NEXT:    v_mov_b32_e32 v1, s0
1629; GFX11W32-NEXT:    buffer_atomic_sub_u32 v1, off, s[8:11], 0 glc
1630; GFX11W32-NEXT:  .LBB6_2:
1631; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
1632; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1633; GFX11W32-NEXT:    v_mul_lo_u32 v0, s4, v0
1634; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1635; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v1
1636; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1637; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1638; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1639; GFX11W32-NEXT:    global_store_b32 v1, v0, s[2:3]
1640; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1641; GFX11W32-NEXT:    s_endpgm
1642entry:
  ; Subtract %subitive from the buffer described by %inout; the three trailing
  ; i32 operands are all 0. %old is the value returned by the atomic.
1643  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
  ; Store the returned value so the result of the atomic is used.
1644  store i32 %old, i32 addrspace(1)* %out
1645  ret void
1646}
1647
1648define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
1649; GFX6-LABEL: sub_i32_varying_vdata:
1650; GFX6:       ; %bb.0: ; %entry
1651; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1652; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1653; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1654; GFX6-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
1655; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1656; GFX6-NEXT:    s_mov_b32 s2, -1
1657; GFX6-NEXT:    s_waitcnt vmcnt(0)
1658; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1659; GFX6-NEXT:    s_endpgm
1660;
1661; GFX8-LABEL: sub_i32_varying_vdata:
1662; GFX8:       ; %bb.0: ; %entry
1663; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1664; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1665; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1666; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1667; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1668; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1669; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1670; GFX8-NEXT:    s_not_b64 exec, exec
1671; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1672; GFX8-NEXT:    s_not_b64 exec, exec
1673; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1674; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1675; GFX8-NEXT:    s_nop 1
1676; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1677; GFX8-NEXT:    s_nop 1
1678; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1679; GFX8-NEXT:    s_nop 1
1680; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1681; GFX8-NEXT:    s_nop 1
1682; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1683; GFX8-NEXT:    s_nop 1
1684; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1685; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
1686; GFX8-NEXT:    s_nop 0
1687; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1688; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1689; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1690; GFX8-NEXT:    ; implicit-def: $vgpr0
1691; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1692; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1693; GFX8-NEXT:  ; %bb.1:
1694; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1695; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1696; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1697; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1698; GFX8-NEXT:  .LBB7_2:
1699; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1700; GFX8-NEXT:    s_waitcnt vmcnt(0)
1701; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1702; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1703; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1704; GFX8-NEXT:    v_mov_b32_e32 v4, s3
1705; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1706; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1707; GFX8-NEXT:    flat_store_dword v[3:4], v0
1708; GFX8-NEXT:    s_endpgm
1709;
1710; GFX9-LABEL: sub_i32_varying_vdata:
1711; GFX9:       ; %bb.0: ; %entry
1712; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1713; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1714; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1715; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1716; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1717; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1718; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1719; GFX9-NEXT:    s_not_b64 exec, exec
1720; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1721; GFX9-NEXT:    s_not_b64 exec, exec
1722; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1723; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1724; GFX9-NEXT:    s_nop 1
1725; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1726; GFX9-NEXT:    s_nop 1
1727; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1728; GFX9-NEXT:    s_nop 1
1729; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1730; GFX9-NEXT:    s_nop 1
1731; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1732; GFX9-NEXT:    s_nop 1
1733; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1734; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
1735; GFX9-NEXT:    s_nop 0
1736; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1737; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1738; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1739; GFX9-NEXT:    ; implicit-def: $vgpr0
1740; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1741; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1742; GFX9-NEXT:  ; %bb.1:
1743; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1744; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1745; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1746; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1747; GFX9-NEXT:  .LBB7_2:
1748; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1749; GFX9-NEXT:    s_waitcnt vmcnt(0)
1750; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1751; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1752; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1753; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1754; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1755; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
1756; GFX9-NEXT:    s_endpgm
1757;
1758; GFX10W64-LABEL: sub_i32_varying_vdata:
1759; GFX10W64:       ; %bb.0: ; %entry
1760; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
1761; GFX10W64-NEXT:    s_not_b64 exec, exec
1762; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1763; GFX10W64-NEXT:    s_not_b64 exec, exec
1764; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1765; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1766; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
1767; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1768; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1769; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1770; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
1771; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1772; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1773; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
1774; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
1775; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1776; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
1777; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1778; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
1779; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1780; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1781; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
1782; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
1783; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1784; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1785; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1786; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
1787; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
1788; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
1789; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1790; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1791; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1792; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
1793; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1794; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1795; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1796; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1797; GFX10W64-NEXT:    s_cbranch_execz .LBB7_2
1798; GFX10W64-NEXT:  ; %bb.1:
1799; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1800; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
1801; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1802; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1803; GFX10W64-NEXT:  .LBB7_2:
1804; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1805; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1806; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1807; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
1808; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
1809; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1810; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1811; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1812; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
1813; GFX10W64-NEXT:    s_endpgm
1814;
1815; GFX10W32-LABEL: sub_i32_varying_vdata:
1816; GFX10W32:       ; %bb.0: ; %entry
1817; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
1818; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1819; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1820; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1821; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
1822; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1823; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1824; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1825; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1826; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
1827; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1828; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
1829; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1830; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1831; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1832; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
1833; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
1834; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1835; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
1836; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1837; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1838; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1839; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
1840; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1841; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1842; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1843; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1844; GFX10W32-NEXT:    s_cbranch_execz .LBB7_2
1845; GFX10W32-NEXT:  ; %bb.1:
1846; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1847; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
1848; GFX10W32-NEXT:    s_mov_b32 s5, s6
1849; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1850; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1851; GFX10W32-NEXT:  .LBB7_2:
1852; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1853; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1854; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1855; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1856; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1857; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1858; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1859; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1860; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1861; GFX10W32-NEXT:    s_endpgm
1862;
1863; GFX11W64-LABEL: sub_i32_varying_vdata:
1864; GFX11W64:       ; %bb.0: ; %entry
1865; GFX11W64-NEXT:    v_mov_b32_e32 v1, v0
1866; GFX11W64-NEXT:    s_not_b64 exec, exec
1867; GFX11W64-NEXT:    v_mov_b32_e32 v1, 0
1868; GFX11W64-NEXT:    s_not_b64 exec, exec
1869; GFX11W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1870; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1871; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1872; GFX11W64-NEXT:    v_mov_b32_e32 v3, 0
1873; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1874; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1875; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1876; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1877; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1878; GFX11W64-NEXT:    v_mov_b32_e32 v2, v1
1879; GFX11W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1880; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1881; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1882; GFX11W64-NEXT:    v_readlane_b32 s4, v1, 31
1883; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1884; GFX11W64-NEXT:    v_mov_b32_e32 v2, s4
1885; GFX11W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1886; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1887; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 15
1888; GFX11W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1889; GFX11W64-NEXT:    s_mov_b64 exec, s[2:3]
1890; GFX11W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1891; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1892; GFX11W64-NEXT:    v_readlane_b32 s7, v1, 31
1893; GFX11W64-NEXT:    v_writelane_b32 v3, s6, 16
1894; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1895; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1896; GFX11W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1897; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1898; GFX11W64-NEXT:    v_readlane_b32 s6, v1, 63
1899; GFX11W64-NEXT:    v_readlane_b32 s8, v1, 47
1900; GFX11W64-NEXT:    v_writelane_b32 v3, s7, 32
1901; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1902; GFX11W64-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1903; GFX11W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1904; GFX11W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1905; GFX11W64-NEXT:    v_writelane_b32 v3, s8, 48
1906; GFX11W64-NEXT:    s_mov_b64 exec, s[4:5]
1907; GFX11W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1908; GFX11W64-NEXT:    ; implicit-def: $vgpr0
1909; GFX11W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1910; GFX11W64-NEXT:    s_cbranch_execz .LBB7_2
1911; GFX11W64-NEXT:  ; %bb.1:
1912; GFX11W64-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1913; GFX11W64-NEXT:    v_mov_b32_e32 v0, s6
1914; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1915; GFX11W64-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
1916; GFX11W64-NEXT:  .LBB7_2:
1917; GFX11W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1918; GFX11W64-NEXT:    s_waitcnt vmcnt(0)
1919; GFX11W64-NEXT:    v_readfirstlane_b32 s0, v0
1920; GFX11W64-NEXT:    v_mov_b32_e32 v0, v3
1921; GFX11W64-NEXT:    v_mov_b32_e32 v4, 0
1922; GFX11W64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1923; GFX11W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1924; GFX11W64-NEXT:    s_waitcnt lgkmcnt(0)
1925; GFX11W64-NEXT:    global_store_b32 v4, v0, s[2:3]
1926; GFX11W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1927; GFX11W64-NEXT:    s_endpgm
1928;
1929; GFX11W32-LABEL: sub_i32_varying_vdata:
1930; GFX11W32:       ; %bb.0: ; %entry
1931; GFX11W32-NEXT:    v_mov_b32_e32 v1, v0
1932; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
1933; GFX11W32-NEXT:    v_mov_b32_e32 v1, 0
1934; GFX11W32-NEXT:    s_not_b32 exec_lo, exec_lo
1935; GFX11W32-NEXT:    s_or_saveexec_b32 s2, -1
1936; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1937; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1938; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1939; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1940; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1941; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1942; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1943; GFX11W32-NEXT:    v_mov_b32_e32 v2, v1
1944; GFX11W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1945; GFX11W32-NEXT:    s_mov_b32 exec_lo, s2
1946; GFX11W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
1947; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
1948; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1949; GFX11W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1950; GFX11W32-NEXT:    v_mov_b32_e32 v3, 0
1951; GFX11W32-NEXT:    v_readlane_b32 s6, v1, 31
1952; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
1953; GFX11W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1954; GFX11W32-NEXT:    v_readlane_b32 s5, v1, 15
1955; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
1956; GFX11W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1957; GFX11W32-NEXT:    s_or_saveexec_b32 s4, -1
1958; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1959; GFX11W32-NEXT:    v_writelane_b32 v3, s5, 16
1960; GFX11W32-NEXT:    s_mov_b32 exec_lo, s4
1961; GFX11W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1962; GFX11W32-NEXT:    ; implicit-def: $vgpr0
1963; GFX11W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1964; GFX11W32-NEXT:    s_cbranch_execz .LBB7_2
1965; GFX11W32-NEXT:  ; %bb.1:
1966; GFX11W32-NEXT:    s_load_b128 s[8:11], s[0:1], 0x34
1967; GFX11W32-NEXT:    v_mov_b32_e32 v0, s6
1968; GFX11W32-NEXT:    s_mov_b32 s5, s6
1969; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1970; GFX11W32-NEXT:    buffer_atomic_sub_u32 v0, off, s[8:11], 0 glc
1971; GFX11W32-NEXT:  .LBB7_2:
1972; GFX11W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1973; GFX11W32-NEXT:    s_waitcnt vmcnt(0)
1974; GFX11W32-NEXT:    v_readfirstlane_b32 s0, v0
1975; GFX11W32-NEXT:    v_mov_b32_e32 v0, v3
1976; GFX11W32-NEXT:    v_mov_b32_e32 v4, 0
1977; GFX11W32-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1978; GFX11W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1979; GFX11W32-NEXT:    s_waitcnt lgkmcnt(0)
1980; GFX11W32-NEXT:    global_store_b32 v4, v0, s[2:3]
1981; GFX11W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1982; GFX11W32-NEXT:    s_endpgm
1983entry:
1984  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1985  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
1986  store i32 %old, i32 addrspace(1)* %out
1987  ret void
1988}
1989
; sub_i32_varying_offset: raw buffer atomic sub whose data operand is the
; constant 1 and whose third intrinsic operand is the workitem id.
; In the expected output for every check prefix below, the atomic is emitted
; directly with 'offen' addressing and its result is stored as-is; none of the
; s_and_saveexec / v_mbcnt / v_readfirstlane sequence seen in the
; sub_i32_varying_vdata checks appears here.
; The check lines are autogenerated (see the NOTE on the first line of this
; file): regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
1990define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1991; GFX6-LABEL: sub_i32_varying_offset:
1992; GFX6:       ; %bb.0: ; %entry
1993; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1994; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1995; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1996; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1997; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1998; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1999; GFX6-NEXT:    s_mov_b32 s2, -1
2000; GFX6-NEXT:    s_waitcnt vmcnt(0)
2001; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
2002; GFX6-NEXT:    s_endpgm
2003;
2004; GFX8-LABEL: sub_i32_varying_offset:
2005; GFX8:       ; %bb.0: ; %entry
2006; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
2007; GFX8-NEXT:    v_mov_b32_e32 v2, 1
2008; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2009; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
2010; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
2011; GFX8-NEXT:    v_mov_b32_e32 v0, s0
2012; GFX8-NEXT:    v_mov_b32_e32 v1, s1
2013; GFX8-NEXT:    s_waitcnt vmcnt(0)
2014; GFX8-NEXT:    flat_store_dword v[0:1], v2
2015; GFX8-NEXT:    s_endpgm
2016;
2017; GFX9-LABEL: sub_i32_varying_offset:
2018; GFX9:       ; %bb.0: ; %entry
2019; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
2020; GFX9-NEXT:    v_mov_b32_e32 v1, 1
2021; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2022; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
2023; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
2024; GFX9-NEXT:    v_mov_b32_e32 v0, 0
2025; GFX9-NEXT:    s_waitcnt vmcnt(0)
2026; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
2027; GFX9-NEXT:    s_endpgm
2028;
2029; GFX10-LABEL: sub_i32_varying_offset:
2030; GFX10:       ; %bb.0: ; %entry
2031; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
2032; GFX10-NEXT:    v_mov_b32_e32 v1, 1
2033; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
2034; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2035; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
2036; GFX10-NEXT:    v_mov_b32_e32 v0, 0
2037; GFX10-NEXT:    s_waitcnt vmcnt(0)
2038; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
2039; GFX10-NEXT:    s_endpgm
2040;
2041; GFX11-LABEL: sub_i32_varying_offset:
2042; GFX11:       ; %bb.0: ; %entry
2043; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x34
2044; GFX11-NEXT:    v_mov_b32_e32 v1, 1
2045; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
2046; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2047; GFX11-NEXT:    buffer_atomic_sub_u32 v1, v0, s[4:7], 0 offen glc
2048; GFX11-NEXT:    v_mov_b32_e32 v0, 0
2049; GFX11-NEXT:    s_waitcnt vmcnt(0)
2050; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
2051; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
2052; GFX11-NEXT:    s_endpgm
2053entry:
2054  %lane = call i32 @llvm.amdgcn.workitem.id.x()
; Operand 0 is the constant 1 and operand 2 is %lane (the workitem id); the
; remaining i32 operands are 0. Going by the test name, operand 2 is the buffer
; offset, which matches the 'offen' form in the checks above.
2055  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
; The value returned by the atomic is written to %out unchanged.
2056  store i32 %old, i32 addrspace(1)* %out
2057  ret void
2058}
2059