1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7
; Declarations of the AMDGPU intrinsics referenced by the kernels in this file:
; the work-item id query, the raw and struct buffer atomic add, and the raw
; buffer atomic sub (the sub variant is not called in the first kernels below;
; presumably it is used further down the file).
8declare i32 @llvm.amdgcn.workitem.id.x()
9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
10declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32 immarg)
11declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 immarg)
12
13; Show what the atomic optimization pass will do for raw and struct buffers.
14
; Constant addend: the kernel performs a raw buffer atomic add of the literal 5
; on the buffer described by %inout and stores the returned value to %out.
;
; In the expected output of every run configuration below, the per-lane atomic
; is replaced by one atomic per wave:
;   * v_mbcnt_lo/hi of the saved exec mask gives each lane its index among the
;     active lanes, and only the lane whose index is 0 enters %bb.1;
;   * that lane adds popcount(exec) * 5 (s_bcnt1 followed by s_mul_i32) with a
;     single buffer_atomic_add ... glc;
;   * after exec is restored, v_readfirstlane broadcasts the returned value and
;     v_mad_u32_u24 rebuilds each lane's result as index * 5 + returned value.
; The wave32 configuration only needs v_mbcnt_lo and the 32-bit exec forms.
; The assertions are autogenerated (see the NOTE at the top of the file); do
; not edit them by hand.
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
16; GFX6-LABEL: add_i32_constant:
17; GFX6:       ; %bb.0: ; %entry
18; GFX6-NEXT:    s_mov_b64 s[2:3], exec
19; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
20; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
21; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
22; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
23; GFX6-NEXT:    ; implicit-def: $vgpr1
24; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
25; GFX6-NEXT:    s_cbranch_execz .LBB0_2
26; GFX6-NEXT:  ; %bb.1:
27; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
28; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
29; GFX6-NEXT:    s_mul_i32 s0, s0, 5
30; GFX6-NEXT:    v_mov_b32_e32 v1, s0
31; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
32; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
33; GFX6-NEXT:  .LBB0_2:
34; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
35; GFX6-NEXT:    s_waitcnt vmcnt(0)
36; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
37; GFX6-NEXT:    s_mov_b32 s7, 0xf000
38; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
39; GFX6-NEXT:    s_mov_b32 s6, -1
40; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
42; GFX6-NEXT:    s_endpgm
43;
44; GFX8-LABEL: add_i32_constant:
45; GFX8:       ; %bb.0: ; %entry
46; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
47; GFX8-NEXT:    s_mov_b64 s[6:7], exec
48; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
49; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
50; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
51; GFX8-NEXT:    ; implicit-def: $vgpr1
52; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
53; GFX8-NEXT:    s_cbranch_execz .LBB0_2
54; GFX8-NEXT:  ; %bb.1:
55; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
56; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
57; GFX8-NEXT:    s_mul_i32 s0, s0, 5
58; GFX8-NEXT:    v_mov_b32_e32 v1, s0
59; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
60; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
61; GFX8-NEXT:  .LBB0_2:
62; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
63; GFX8-NEXT:    s_waitcnt vmcnt(0)
64; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
65; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
66; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
67; GFX8-NEXT:    v_mov_b32_e32 v0, s2
68; GFX8-NEXT:    v_mov_b32_e32 v1, s3
69; GFX8-NEXT:    flat_store_dword v[0:1], v2
70; GFX8-NEXT:    s_endpgm
71;
72; GFX9-LABEL: add_i32_constant:
73; GFX9:       ; %bb.0: ; %entry
74; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
75; GFX9-NEXT:    s_mov_b64 s[6:7], exec
76; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
77; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
78; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
79; GFX9-NEXT:    ; implicit-def: $vgpr1
80; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
81; GFX9-NEXT:    s_cbranch_execz .LBB0_2
82; GFX9-NEXT:  ; %bb.1:
83; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
84; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
85; GFX9-NEXT:    s_mul_i32 s0, s0, 5
86; GFX9-NEXT:    v_mov_b32_e32 v1, s0
87; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
88; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
89; GFX9-NEXT:  .LBB0_2:
90; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
91; GFX9-NEXT:    s_waitcnt vmcnt(0)
92; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
93; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
94; GFX9-NEXT:    v_mov_b32_e32 v1, 0
95; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
97; GFX9-NEXT:    s_endpgm
98;
99; GFX10W64-LABEL: add_i32_constant:
100; GFX10W64:       ; %bb.0: ; %entry
101; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
102; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
103; GFX10W64-NEXT:    ; implicit-def: $vgpr1
104; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
105; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
106; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
107; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
108; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
109; GFX10W64-NEXT:  ; %bb.1:
110; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
111; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
112; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
113; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
114; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
116; GFX10W64-NEXT:  .LBB0_2:
117; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
118; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
119; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
120; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
121; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
122; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
123; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
124; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
125; GFX10W64-NEXT:    s_endpgm
126;
127; GFX10W32-LABEL: add_i32_constant:
128; GFX10W32:       ; %bb.0: ; %entry
129; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
130; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
131; GFX10W32-NEXT:    ; implicit-def: $vgpr1
132; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
133; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
134; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
135; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
136; GFX10W32-NEXT:  ; %bb.1:
137; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
138; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
139; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
140; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
141; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
142; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
143; GFX10W32-NEXT:  .LBB0_2:
144; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
145; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
146; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
147; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
148; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
149; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
150; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
152; GFX10W32-NEXT:    s_endpgm
153entry:
  ; Atomic add of the immediate 5; the value the intrinsic returns is what
  ; gets written to %out.
154  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
155  store i32 %old, i32 addrspace(1)* %out
156  ret void
157}
158
; Uniform addend: same shape as the constant case, but the value added is the
; kernel argument %additive (fetched with s_load_dword in the expected output).
;
; Expected output for every run configuration below:
;   * only the lane whose v_mbcnt index is 0 enters %bb.1 and issues a single
;     buffer_atomic_add ... glc of %additive * popcount(exec)
;     (s_bcnt1 followed by s_mul_i32);
;   * after exec is restored, v_readfirstlane broadcasts the returned value and
;     each lane adds %additive * index to it. The GFX6/GFX8/GFX9 runs use
;     v_mul_lo_u32 plus an add; both GFX10 runs use one v_mad_u64_u32 and store
;     only its low dword (v0).
; The assertions are autogenerated (see the NOTE at the top of the file); do
; not edit them by hand.
159define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
160; GFX6-LABEL: add_i32_uniform:
161; GFX6:       ; %bb.0: ; %entry
162; GFX6-NEXT:    s_mov_b64 s[2:3], exec
163; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
164; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
165; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
166; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
167; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
168; GFX6-NEXT:    ; implicit-def: $vgpr1
169; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
170; GFX6-NEXT:    s_cbranch_execz .LBB1_2
171; GFX6-NEXT:  ; %bb.1:
172; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
173; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
174; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX6-NEXT:    s_mul_i32 s0, s8, s0
176; GFX6-NEXT:    v_mov_b32_e32 v1, s0
177; GFX6-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
178; GFX6-NEXT:  .LBB1_2:
179; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
180; GFX6-NEXT:    s_waitcnt vmcnt(0)
181; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
182; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
184; GFX6-NEXT:    s_mov_b32 s7, 0xf000
185; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
186; GFX6-NEXT:    s_mov_b32 s6, -1
187; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
188; GFX6-NEXT:    s_endpgm
189;
190; GFX8-LABEL: add_i32_uniform:
191; GFX8:       ; %bb.0: ; %entry
192; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
193; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
194; GFX8-NEXT:    s_mov_b64 s[4:5], exec
195; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
196; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
197; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
198; GFX8-NEXT:    ; implicit-def: $vgpr1
199; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
200; GFX8-NEXT:    s_cbranch_execz .LBB1_2
201; GFX8-NEXT:  ; %bb.1:
202; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
203; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
204; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX8-NEXT:    s_mul_i32 s0, s8, s0
206; GFX8-NEXT:    v_mov_b32_e32 v1, s0
207; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
208; GFX8-NEXT:  .LBB1_2:
209; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
210; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
211; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
212; GFX8-NEXT:    s_waitcnt vmcnt(0)
213; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
214; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
215; GFX8-NEXT:    v_mov_b32_e32 v0, s2
216; GFX8-NEXT:    v_mov_b32_e32 v1, s3
217; GFX8-NEXT:    flat_store_dword v[0:1], v2
218; GFX8-NEXT:    s_endpgm
219;
220; GFX9-LABEL: add_i32_uniform:
221; GFX9:       ; %bb.0: ; %entry
222; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
223; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
224; GFX9-NEXT:    s_mov_b64 s[4:5], exec
225; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
226; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
227; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
228; GFX9-NEXT:    ; implicit-def: $vgpr1
229; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
230; GFX9-NEXT:    s_cbranch_execz .LBB1_2
231; GFX9-NEXT:  ; %bb.1:
232; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
233; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
234; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
235; GFX9-NEXT:    s_mul_i32 s0, s8, s0
236; GFX9-NEXT:    v_mov_b32_e32 v1, s0
237; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
238; GFX9-NEXT:  .LBB1_2:
239; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
240; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
242; GFX9-NEXT:    s_waitcnt vmcnt(0)
243; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
244; GFX9-NEXT:    v_mov_b32_e32 v1, 0
245; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
246; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
247; GFX9-NEXT:    s_endpgm
248;
249; GFX10W64-LABEL: add_i32_uniform:
250; GFX10W64:       ; %bb.0: ; %entry
251; GFX10W64-NEXT:    s_clause 0x1
252; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
253; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
254; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
255; GFX10W64-NEXT:    ; implicit-def: $vgpr1
256; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
257; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
258; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
259; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
260; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
261; GFX10W64-NEXT:  ; %bb.1:
262; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
263; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
264; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
266; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
267; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
268; GFX10W64-NEXT:  .LBB1_2:
269; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
270; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
271; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
272; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
273; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
274; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], null, s8, v0, s[0:1]
275; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
276; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
277; GFX10W64-NEXT:    s_endpgm
278;
279; GFX10W32-LABEL: add_i32_uniform:
280; GFX10W32:       ; %bb.0: ; %entry
281; GFX10W32-NEXT:    s_clause 0x1
282; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
283; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
284; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
285; GFX10W32-NEXT:    ; implicit-def: $vgpr1
286; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
287; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
288; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
289; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
290; GFX10W32-NEXT:  ; %bb.1:
291; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
292; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
293; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
294; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
295; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
296; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
297; GFX10W32-NEXT:  .LBB1_2:
298; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
299; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
300; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
301; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
302; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
303; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], null, s4, v0, s[0:1]
304; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
305; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
306; GFX10W32-NEXT:    s_endpgm
307entry:
  ; Atomic add of the kernel argument %additive; the value the intrinsic
  ; returns is what gets written to %out.
308  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
309  store i32 %old, i32 addrspace(1)* %out
310  ret void
311}
312
; Varying addend: each lane adds its own work-item id (llvm.amdgcn.workitem.id.x)
; to the buffer described by %inout and stores the returned value to %out.
;
; Expected output per run configuration:
;   * The GFX6 run keeps the plain per-lane buffer_atomic_add on v0; no
;     mbcnt/branch sequence is expected there.
;   * The GFX8 and GFX9 runs copy the addend into v2, write 0 into the lanes
;     that are inactive (the s_not_b64 exec pair), then combine lanes with a
;     chain of DPP adds (row_shr:1/2/4/8, row_bcast:15, row_bcast:31) executed
;     with all lanes enabled. Lane 63 is read as the wave total (v_readlane into
;     s6) and a wave_shr:1 DPP move produces the per-lane offset in v1 - this
;     looks like an inclusive scan shifted into an exclusive one. Only the lane
;     whose v_mbcnt index is 0 issues the atomic with the total, and every lane
;     then adds its v1 offset to the v_readfirstlane of the returned value.
;   * The two GFX10 runs follow the same scheme but build the scan from
;     row_shr DPP adds plus v_permlanex16_b32, and patch the row boundaries
;     with v_readlane/v_writelane. The total is read from lane 63 in the
;     wave64 run and from lane 31 in the wave32 run.
; The assertions are autogenerated (see the NOTE at the top of the file); do
; not edit them by hand.
313define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
314; GFX6-LABEL: add_i32_varying_vdata:
315; GFX6:       ; %bb.0: ; %entry
316; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
317; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
318; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
319; GFX6-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
320; GFX6-NEXT:    s_mov_b32 s3, 0xf000
321; GFX6-NEXT:    s_mov_b32 s2, -1
322; GFX6-NEXT:    s_waitcnt vmcnt(0)
323; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
324; GFX6-NEXT:    s_endpgm
325;
326; GFX8-LABEL: add_i32_varying_vdata:
327; GFX8:       ; %bb.0: ; %entry
328; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
329; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
330; GFX8-NEXT:    v_mov_b32_e32 v1, 0
331; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
332; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
333; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
334; GFX8-NEXT:    v_mov_b32_e32 v2, v0
335; GFX8-NEXT:    s_not_b64 exec, exec
336; GFX8-NEXT:    v_mov_b32_e32 v2, 0
337; GFX8-NEXT:    s_not_b64 exec, exec
338; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
339; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
340; GFX8-NEXT:    s_nop 1
341; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
342; GFX8-NEXT:    s_nop 1
343; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
344; GFX8-NEXT:    s_nop 1
345; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
346; GFX8-NEXT:    s_nop 1
347; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
348; GFX8-NEXT:    s_nop 1
349; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
350; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
351; GFX8-NEXT:    s_nop 0
352; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
353; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
354; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
355; GFX8-NEXT:    ; implicit-def: $vgpr0
356; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
357; GFX8-NEXT:    s_cbranch_execz .LBB2_2
358; GFX8-NEXT:  ; %bb.1:
359; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
360; GFX8-NEXT:    v_mov_b32_e32 v0, s6
361; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
362; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
363; GFX8-NEXT:  .LBB2_2:
364; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
365; GFX8-NEXT:    s_waitcnt vmcnt(0)
366; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
367; GFX8-NEXT:    v_mov_b32_e32 v0, v1
368; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX8-NEXT:    v_mov_b32_e32 v4, s3
370; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
371; GFX8-NEXT:    v_mov_b32_e32 v3, s2
372; GFX8-NEXT:    flat_store_dword v[3:4], v0
373; GFX8-NEXT:    s_endpgm
374;
375; GFX9-LABEL: add_i32_varying_vdata:
376; GFX9:       ; %bb.0: ; %entry
377; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
378; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
379; GFX9-NEXT:    v_mov_b32_e32 v1, 0
380; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
381; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
382; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
383; GFX9-NEXT:    v_mov_b32_e32 v2, v0
384; GFX9-NEXT:    s_not_b64 exec, exec
385; GFX9-NEXT:    v_mov_b32_e32 v2, 0
386; GFX9-NEXT:    s_not_b64 exec, exec
387; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
388; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
389; GFX9-NEXT:    s_nop 1
390; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
391; GFX9-NEXT:    s_nop 1
392; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
393; GFX9-NEXT:    s_nop 1
394; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
395; GFX9-NEXT:    s_nop 1
396; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
397; GFX9-NEXT:    s_nop 1
398; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
399; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
400; GFX9-NEXT:    s_nop 0
401; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
402; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
403; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
404; GFX9-NEXT:    ; implicit-def: $vgpr0
405; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
406; GFX9-NEXT:    s_cbranch_execz .LBB2_2
407; GFX9-NEXT:  ; %bb.1:
408; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
409; GFX9-NEXT:    v_mov_b32_e32 v0, s6
410; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
411; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
412; GFX9-NEXT:  .LBB2_2:
413; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
414; GFX9-NEXT:    s_waitcnt vmcnt(0)
415; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
416; GFX9-NEXT:    v_mov_b32_e32 v0, v1
417; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
418; GFX9-NEXT:    v_mov_b32_e32 v3, 0
419; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
421; GFX9-NEXT:    s_endpgm
422;
423; GFX10W64-LABEL: add_i32_varying_vdata:
424; GFX10W64:       ; %bb.0: ; %entry
425; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
426; GFX10W64-NEXT:    s_not_b64 exec, exec
427; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
428; GFX10W64-NEXT:    s_not_b64 exec, exec
429; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
430; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
431; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
432; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
433; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
434; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
435; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
436; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
437; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
438; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
439; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
440; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
441; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
442; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
443; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
444; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
445; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
446; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
447; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
448; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
449; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
450; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
451; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
452; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
453; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
454; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
455; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
456; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
457; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
458; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
459; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
460; GFX10W64-NEXT:    ; implicit-def: $vgpr0
461; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
462; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
463; GFX10W64-NEXT:  ; %bb.1:
464; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
465; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
466; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
467; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
468; GFX10W64-NEXT:  .LBB2_2:
469; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
470; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
471; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
472; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
473; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
474; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
475; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
476; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
477; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
478; GFX10W64-NEXT:    s_endpgm
479;
480; GFX10W32-LABEL: add_i32_varying_vdata:
481; GFX10W32:       ; %bb.0: ; %entry
482; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
483; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
484; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
485; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
486; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
487; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
488; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
489; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
490; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
491; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
492; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
493; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
494; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
495; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
496; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
497; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
498; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
499; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
500; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
501; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
502; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
503; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
504; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
505; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
506; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
507; GFX10W32-NEXT:    ; implicit-def: $vgpr0
508; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
509; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
510; GFX10W32-NEXT:  ; %bb.1:
511; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
512; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
513; GFX10W32-NEXT:    s_mov_b32 s5, s6
514; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
515; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
516; GFX10W32-NEXT:  .LBB2_2:
517; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
518; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
519; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
520; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
521; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
522; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
523; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
524; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
525; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
526; GFX10W32-NEXT:    s_endpgm
527entry:
  ; %lane differs per work item, which is what makes the addend "varying".
528  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Atomic add of the per-lane value; the value the intrinsic returns is what
  ; gets written to %out.
529  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
530  store i32 %old, i32 addrspace(1)* %out
531  ret void
532}
533
534define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %vindex) {
535; GFX6-LABEL: struct_add_i32_varying_vdata:
536; GFX6:       ; %bb.0: ; %entry
537; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x11
538; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
539; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
540; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
541; GFX6-NEXT:    v_mov_b32_e32 v1, s2
542; GFX6-NEXT:    buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
543; GFX6-NEXT:    s_mov_b32 s3, 0xf000
544; GFX6-NEXT:    s_mov_b32 s2, -1
545; GFX6-NEXT:    s_waitcnt vmcnt(0)
546; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
547; GFX6-NEXT:    s_endpgm
548;
549; GFX8-LABEL: struct_add_i32_varying_vdata:
550; GFX8:       ; %bb.0: ; %entry
551; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
552; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
553; GFX8-NEXT:    v_mov_b32_e32 v1, 0
554; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
555; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
556; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
557; GFX8-NEXT:    v_mov_b32_e32 v2, v0
558; GFX8-NEXT:    s_not_b64 exec, exec
559; GFX8-NEXT:    v_mov_b32_e32 v2, 0
560; GFX8-NEXT:    s_not_b64 exec, exec
561; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
562; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
563; GFX8-NEXT:    s_nop 1
564; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
565; GFX8-NEXT:    s_nop 1
566; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
567; GFX8-NEXT:    s_nop 1
568; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
569; GFX8-NEXT:    s_nop 1
570; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
571; GFX8-NEXT:    s_nop 1
572; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
573; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
574; GFX8-NEXT:    s_nop 0
575; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
576; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
577; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
578; GFX8-NEXT:    ; implicit-def: $vgpr0
579; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
580; GFX8-NEXT:    s_cbranch_execz .LBB3_2
581; GFX8-NEXT:  ; %bb.1:
582; GFX8-NEXT:    s_load_dword s7, s[0:1], 0x44
583; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
584; GFX8-NEXT:    v_mov_b32_e32 v0, s6
585; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX8-NEXT:    v_mov_b32_e32 v3, s7
587; GFX8-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
588; GFX8-NEXT:  .LBB3_2:
589; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
590; GFX8-NEXT:    s_waitcnt vmcnt(0)
591; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
592; GFX8-NEXT:    v_mov_b32_e32 v0, v1
593; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
594; GFX8-NEXT:    v_mov_b32_e32 v4, s3
595; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
596; GFX8-NEXT:    v_mov_b32_e32 v3, s2
597; GFX8-NEXT:    flat_store_dword v[3:4], v0
598; GFX8-NEXT:    s_endpgm
599;
600; GFX9-LABEL: struct_add_i32_varying_vdata:
601; GFX9:       ; %bb.0: ; %entry
602; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
603; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
604; GFX9-NEXT:    v_mov_b32_e32 v1, 0
605; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
606; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
607; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
608; GFX9-NEXT:    v_mov_b32_e32 v2, v0
609; GFX9-NEXT:    s_not_b64 exec, exec
610; GFX9-NEXT:    v_mov_b32_e32 v2, 0
611; GFX9-NEXT:    s_not_b64 exec, exec
612; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
613; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
614; GFX9-NEXT:    s_nop 1
615; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
616; GFX9-NEXT:    s_nop 1
617; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
618; GFX9-NEXT:    s_nop 1
619; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
620; GFX9-NEXT:    s_nop 1
621; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
622; GFX9-NEXT:    s_nop 1
623; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
624; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
625; GFX9-NEXT:    s_nop 0
626; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
627; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
628; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
629; GFX9-NEXT:    ; implicit-def: $vgpr0
630; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
631; GFX9-NEXT:    s_cbranch_execz .LBB3_2
632; GFX9-NEXT:  ; %bb.1:
633; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x44
634; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
635; GFX9-NEXT:    v_mov_b32_e32 v0, s6
636; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
637; GFX9-NEXT:    v_mov_b32_e32 v3, s7
638; GFX9-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
639; GFX9-NEXT:  .LBB3_2:
640; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
641; GFX9-NEXT:    s_waitcnt vmcnt(0)
642; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
643; GFX9-NEXT:    v_mov_b32_e32 v0, v1
644; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
645; GFX9-NEXT:    v_mov_b32_e32 v3, 0
646; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
647; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
648; GFX9-NEXT:    s_endpgm
649;
650; GFX10W64-LABEL: struct_add_i32_varying_vdata:
651; GFX10W64:       ; %bb.0: ; %entry
652; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
653; GFX10W64-NEXT:    s_not_b64 exec, exec
654; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
655; GFX10W64-NEXT:    s_not_b64 exec, exec
656; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
657; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
658; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
659; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
660; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
661; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
662; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
663; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
664; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
665; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
666; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
667; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
668; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
669; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
670; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
671; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
672; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
673; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
674; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
675; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
676; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
677; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
678; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
679; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
680; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
681; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
682; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
683; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
684; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
685; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
686; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
687; GFX10W64-NEXT:    ; implicit-def: $vgpr0
688; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
689; GFX10W64-NEXT:    s_cbranch_execz .LBB3_2
690; GFX10W64-NEXT:  ; %bb.1:
691; GFX10W64-NEXT:    s_clause 0x1
692; GFX10W64-NEXT:    s_load_dword s7, s[0:1], 0x44
693; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
694; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
695; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
696; GFX10W64-NEXT:    v_mov_b32_e32 v4, s7
697; GFX10W64-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
698; GFX10W64-NEXT:  .LBB3_2:
699; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
700; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
701; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
702; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
703; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
704; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
705; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
706; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
707; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
708; GFX10W64-NEXT:    s_endpgm
709;
710; GFX10W32-LABEL: struct_add_i32_varying_vdata:
711; GFX10W32:       ; %bb.0: ; %entry
712; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
713; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
714; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
715; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
716; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
717; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
718; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
719; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
720; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
721; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
722; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
723; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
724; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
725; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
726; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
727; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
728; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
729; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
730; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
731; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
732; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
733; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
734; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
735; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
736; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
737; GFX10W32-NEXT:    ; implicit-def: $vgpr0
738; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
739; GFX10W32-NEXT:    s_cbranch_execz .LBB3_2
740; GFX10W32-NEXT:  ; %bb.1:
741; GFX10W32-NEXT:    s_mov_b32 s5, s6
742; GFX10W32-NEXT:    s_clause 0x1
743; GFX10W32-NEXT:    s_load_dword s6, s[0:1], 0x44
744; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
745; GFX10W32-NEXT:    v_mov_b32_e32 v0, s5
746; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
747; GFX10W32-NEXT:    v_mov_b32_e32 v4, s6
748; GFX10W32-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
749; GFX10W32-NEXT:  .LBB3_2:
750; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
751; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
752; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
753; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
754; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
755; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
756; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
757; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
758; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
759; GFX10W32-NEXT:    s_endpgm
760entry:
761  %lane = call i32 @llvm.amdgcn.workitem.id.x()
762  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 %vindex, i32 0, i32 0, i32 0)
763  store i32 %old, i32 addrspace(1)* %out
764  ret void
765}
766
; Raw buffer atomic add where the added value is the constant 1 and the third
; intrinsic operand (presumably the per-lane offset, given the test name and the
; `offen` form in the expected code) is the workitem id.
; Every check group below expects one plain `buffer_atomic_add ... offen glc`
; followed by a store of the returned value, with no mbcnt / exec-masking code
; around it, i.e. the atomic is expected to be emitted unchanged in this case.
767define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
768; GFX6-LABEL: add_i32_varying_offset:
769; GFX6:       ; %bb.0: ; %entry
770; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
771; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
772; GFX6-NEXT:    v_mov_b32_e32 v1, 1
773; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
774; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
775; GFX6-NEXT:    s_mov_b32 s3, 0xf000
776; GFX6-NEXT:    s_mov_b32 s2, -1
777; GFX6-NEXT:    s_waitcnt vmcnt(0)
778; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
779; GFX6-NEXT:    s_endpgm
780;
781; GFX8-LABEL: add_i32_varying_offset:
782; GFX8:       ; %bb.0: ; %entry
783; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
784; GFX8-NEXT:    v_mov_b32_e32 v2, 1
785; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
786; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 offen glc
788; GFX8-NEXT:    v_mov_b32_e32 v0, s0
789; GFX8-NEXT:    v_mov_b32_e32 v1, s1
790; GFX8-NEXT:    s_waitcnt vmcnt(0)
791; GFX8-NEXT:    flat_store_dword v[0:1], v2
792; GFX8-NEXT:    s_endpgm
793;
794; GFX9-LABEL: add_i32_varying_offset:
795; GFX9:       ; %bb.0: ; %entry
796; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
797; GFX9-NEXT:    v_mov_b32_e32 v1, 1
798; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
799; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
800; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
801; GFX9-NEXT:    v_mov_b32_e32 v0, 0
802; GFX9-NEXT:    s_waitcnt vmcnt(0)
803; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
804; GFX9-NEXT:    s_endpgm
805;
806; GFX10-LABEL: add_i32_varying_offset:
807; GFX10:       ; %bb.0: ; %entry
808; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
809; GFX10-NEXT:    v_mov_b32_e32 v1, 1
810; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
811; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
812; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
813; GFX10-NEXT:    v_mov_b32_e32 v0, 0
814; GFX10-NEXT:    s_waitcnt vmcnt(0)
815; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
816; GFX10-NEXT:    s_endpgm
817entry:
818  %lane = call i32 @llvm.amdgcn.workitem.id.x()
819  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
820  store i32 %old, i32 addrspace(1)* %out
821  ret void
822}
823
; Raw buffer atomic sub of the constant 5 with all offset operands zero; the
; returned value is stored to %out.
; The expected code in every check group follows the same shape:
;   - v_mbcnt on a saved copy of exec gives each lane its count of lower active
;     lanes; only the lane whose count is 0 enters the guarded block,
;   - that lane issues one buffer_atomic_sub of 5 * popcount(saved exec)
;     (s_bcnt1 followed by s_mul_i32 ..., 5),
;   - after the join, v_readfirstlane broadcasts the returned value and each
;     lane subtracts 5 * its mbcnt result to form the value it stores.
824define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
825; GFX6-LABEL: sub_i32_constant:
826; GFX6:       ; %bb.0: ; %entry
827; GFX6-NEXT:    s_mov_b64 s[2:3], exec
828; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
829; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
830; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
831; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
832; GFX6-NEXT:    ; implicit-def: $vgpr1
833; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
834; GFX6-NEXT:    s_cbranch_execz .LBB5_2
835; GFX6-NEXT:  ; %bb.1:
836; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
837; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
838; GFX6-NEXT:    s_mul_i32 s0, s0, 5
839; GFX6-NEXT:    v_mov_b32_e32 v1, s0
840; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
841; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
842; GFX6-NEXT:  .LBB5_2:
843; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
844; GFX6-NEXT:    s_waitcnt vmcnt(0)
845; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
846; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
847; GFX6-NEXT:    s_mov_b32 s7, 0xf000
848; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
849; GFX6-NEXT:    s_mov_b32 s6, -1
850; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
851; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
852; GFX6-NEXT:    s_endpgm
853;
854; GFX8-LABEL: sub_i32_constant:
855; GFX8:       ; %bb.0: ; %entry
856; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
857; GFX8-NEXT:    s_mov_b64 s[6:7], exec
858; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
859; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
860; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
861; GFX8-NEXT:    ; implicit-def: $vgpr1
862; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
863; GFX8-NEXT:    s_cbranch_execz .LBB5_2
864; GFX8-NEXT:  ; %bb.1:
865; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
866; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
867; GFX8-NEXT:    s_mul_i32 s0, s0, 5
868; GFX8-NEXT:    v_mov_b32_e32 v1, s0
869; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
870; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
871; GFX8-NEXT:  .LBB5_2:
872; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
873; GFX8-NEXT:    s_waitcnt vmcnt(0)
874; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
875; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
876; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
877; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
878; GFX8-NEXT:    v_mov_b32_e32 v0, s2
879; GFX8-NEXT:    v_mov_b32_e32 v1, s3
880; GFX8-NEXT:    flat_store_dword v[0:1], v2
881; GFX8-NEXT:    s_endpgm
882;
883; GFX9-LABEL: sub_i32_constant:
884; GFX9:       ; %bb.0: ; %entry
885; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
886; GFX9-NEXT:    s_mov_b64 s[6:7], exec
887; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
888; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
889; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
890; GFX9-NEXT:    ; implicit-def: $vgpr1
891; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
892; GFX9-NEXT:    s_cbranch_execz .LBB5_2
893; GFX9-NEXT:  ; %bb.1:
894; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
895; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
896; GFX9-NEXT:    s_mul_i32 s0, s0, 5
897; GFX9-NEXT:    v_mov_b32_e32 v1, s0
898; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
899; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
900; GFX9-NEXT:  .LBB5_2:
901; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
902; GFX9-NEXT:    s_waitcnt vmcnt(0)
903; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
904; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
905; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
906; GFX9-NEXT:    v_mov_b32_e32 v1, 0
907; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
908; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
909; GFX9-NEXT:    s_endpgm
910;
911; GFX10W64-LABEL: sub_i32_constant:
912; GFX10W64:       ; %bb.0: ; %entry
913; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
914; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
915; GFX10W64-NEXT:    ; implicit-def: $vgpr1
916; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
917; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
918; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
919; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
920; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
921; GFX10W64-NEXT:  ; %bb.1:
922; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
923; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
924; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
925; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
926; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
927; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
928; GFX10W64-NEXT:  .LBB5_2:
929; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
930; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
931; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
932; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
933; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
934; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
935; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
936; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
937; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
938; GFX10W64-NEXT:    s_endpgm
939;
940; GFX10W32-LABEL: sub_i32_constant:
941; GFX10W32:       ; %bb.0: ; %entry
942; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
943; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
944; GFX10W32-NEXT:    ; implicit-def: $vgpr1
945; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
946; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
947; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
948; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
949; GFX10W32-NEXT:  ; %bb.1:
950; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
951; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
952; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
953; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
954; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
955; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
956; GFX10W32-NEXT:  .LBB5_2:
957; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
958; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
959; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
960; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
961; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
962; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
963; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
964; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
965; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
966; GFX10W32-NEXT:    s_endpgm
967entry:
968  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
969  store i32 %old, i32 addrspace(1)* %out
970  ret void
971}
972
; Raw buffer atomic sub whose value operand is the kernel argument %subitive
; (the same value in every lane); all offset operands are zero.
; The expected code mirrors the constant case, but the multiplier is a scalar
; loaded from the kernel arguments instead of an immediate:
;   - the lane whose mbcnt result is 0 issues one buffer_atomic_sub of
;     (loaded scalar) * popcount(saved exec)  (s_bcnt1 followed by s_mul_i32),
;   - after the join each lane computes (loaded scalar) * mbcnt with
;     v_mul_lo_u32 and subtracts it from the v_readfirstlane of the atomic's
;     returned value before storing to %out.
; NOTE(review): the loaded scalar is presumably %subitive; the checks only show
; its kernarg offset, so confirm against the kernel argument layout.
973define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
974; GFX6-LABEL: sub_i32_uniform:
975; GFX6:       ; %bb.0: ; %entry
976; GFX6-NEXT:    s_mov_b64 s[2:3], exec
977; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
978; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
979; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
980; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
981; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
982; GFX6-NEXT:    ; implicit-def: $vgpr1
983; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
984; GFX6-NEXT:    s_cbranch_execz .LBB6_2
985; GFX6-NEXT:  ; %bb.1:
986; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
987; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
988; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
989; GFX6-NEXT:    s_mul_i32 s0, s8, s0
990; GFX6-NEXT:    v_mov_b32_e32 v1, s0
991; GFX6-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
992; GFX6-NEXT:  .LBB6_2:
993; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
994; GFX6-NEXT:    s_waitcnt vmcnt(0)
995; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
996; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
997; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
998; GFX6-NEXT:    s_mov_b32 s7, 0xf000
999; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1000; GFX6-NEXT:    s_mov_b32 s6, -1
1001; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1002; GFX6-NEXT:    s_endpgm
1003;
1004; GFX8-LABEL: sub_i32_uniform:
1005; GFX8:       ; %bb.0: ; %entry
1006; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1007; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
1008; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1009; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1010; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1011; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1012; GFX8-NEXT:    ; implicit-def: $vgpr1
1013; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1014; GFX8-NEXT:    s_cbranch_execz .LBB6_2
1015; GFX8-NEXT:  ; %bb.1:
1016; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1017; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1018; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1019; GFX8-NEXT:    s_mul_i32 s0, s8, s0
1020; GFX8-NEXT:    v_mov_b32_e32 v1, s0
1021; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1022; GFX8-NEXT:  .LBB6_2:
1023; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1024; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
1026; GFX8-NEXT:    s_waitcnt vmcnt(0)
1027; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1028; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
1029; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1030; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1031; GFX8-NEXT:    flat_store_dword v[0:1], v2
1032; GFX8-NEXT:    s_endpgm
1033;
1034; GFX9-LABEL: sub_i32_uniform:
1035; GFX9:       ; %bb.0: ; %entry
1036; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1037; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
1038; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1039; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1040; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1041; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1042; GFX9-NEXT:    ; implicit-def: $vgpr1
1043; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1044; GFX9-NEXT:    s_cbranch_execz .LBB6_2
1045; GFX9-NEXT:  ; %bb.1:
1046; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1047; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1048; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1049; GFX9-NEXT:    s_mul_i32 s0, s8, s0
1050; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1051; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1052; GFX9-NEXT:  .LBB6_2:
1053; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
1054; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1055; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
1056; GFX9-NEXT:    s_waitcnt vmcnt(0)
1057; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1058; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1059; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1060; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
1061; GFX9-NEXT:    s_endpgm
1062;
1063; GFX10W64-LABEL: sub_i32_uniform:
1064; GFX10W64:       ; %bb.0: ; %entry
1065; GFX10W64-NEXT:    s_clause 0x1
1066; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1067; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
1068; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
1069; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1070; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1071; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1072; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1073; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1074; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
1075; GFX10W64-NEXT:  ; %bb.1:
1076; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1077; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1078; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1079; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
1080; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
1081; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1082; GFX10W64-NEXT:  .LBB6_2:
1083; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1084; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
1085; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1086; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
1087; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1088; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
1089; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1090; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1091; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
1092; GFX10W64-NEXT:    s_endpgm
1093;
1094; GFX10W32-LABEL: sub_i32_uniform:
1095; GFX10W32:       ; %bb.0: ; %entry
1096; GFX10W32-NEXT:    s_clause 0x1
1097; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1098; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
1099; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
1100; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1101; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1102; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1103; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
1104; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
1105; GFX10W32-NEXT:  ; %bb.1:
1106; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1107; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
1108; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1109; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
1110; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
1111; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1112; GFX10W32-NEXT:  .LBB6_2:
1113; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1114; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
1115; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
1117; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1118; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1119; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1120; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1121; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1122; GFX10W32-NEXT:    s_endpgm
1123entry:
1124  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
1125  store i32 %old, i32 addrspace(1)* %out
1126  ret void
1127}
1128
; Raw buffer atomic sub whose value operand is the workitem id, so it differs
; per lane; all offset operands are zero.
; Expected code, as recorded in the checks below:
;   - the GFX6 group expects a plain per-lane buffer_atomic_sub with no
;     surrounding reduction code,
;   - the remaining groups expect a sequence of DPP adds over the per-lane
;     values (inactive lanes are first zeroed under an inverted exec mask), a
;     v_readlane of the last lane (63 for the 64-wide groups, 31 for the
;     32-wide group) feeding a single buffer_atomic_sub issued by the lane
;     whose mbcnt result is 0, and finally a v_sub of a lane-shifted copy of
;     the DPP result from the v_readfirstlane of the atomic's returned value.
; NOTE(review): the DPP sequence looks like a wave-wide prefix sum with the
; shifted copy acting as the exclusive prefix -- confirm against the atomic
; optimizer pass before relying on that description.
1129define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
1130; GFX6-LABEL: sub_i32_varying_vdata:
1131; GFX6:       ; %bb.0: ; %entry
1132; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1133; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1134; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1135; GFX6-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
1136; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1137; GFX6-NEXT:    s_mov_b32 s2, -1
1138; GFX6-NEXT:    s_waitcnt vmcnt(0)
1139; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1140; GFX6-NEXT:    s_endpgm
1141;
1142; GFX8-LABEL: sub_i32_varying_vdata:
1143; GFX8:       ; %bb.0: ; %entry
1144; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1145; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1146; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1147; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1148; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1149; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1150; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1151; GFX8-NEXT:    s_not_b64 exec, exec
1152; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1153; GFX8-NEXT:    s_not_b64 exec, exec
1154; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1155; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1156; GFX8-NEXT:    s_nop 1
1157; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1158; GFX8-NEXT:    s_nop 1
1159; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1160; GFX8-NEXT:    s_nop 1
1161; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1162; GFX8-NEXT:    s_nop 1
1163; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1164; GFX8-NEXT:    s_nop 1
1165; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1166; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
1167; GFX8-NEXT:    s_nop 0
1168; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1169; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1170; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1171; GFX8-NEXT:    ; implicit-def: $vgpr0
1172; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1173; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1174; GFX8-NEXT:  ; %bb.1:
1175; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1176; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1177; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1178; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1179; GFX8-NEXT:  .LBB7_2:
1180; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1181; GFX8-NEXT:    s_waitcnt vmcnt(0)
1182; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1183; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1184; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1185; GFX8-NEXT:    v_mov_b32_e32 v4, s3
1186; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1187; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1188; GFX8-NEXT:    flat_store_dword v[3:4], v0
1189; GFX8-NEXT:    s_endpgm
1190;
1191; GFX9-LABEL: sub_i32_varying_vdata:
1192; GFX9:       ; %bb.0: ; %entry
1193; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1194; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1195; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1196; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1197; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1198; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1199; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1200; GFX9-NEXT:    s_not_b64 exec, exec
1201; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1202; GFX9-NEXT:    s_not_b64 exec, exec
1203; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1204; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1205; GFX9-NEXT:    s_nop 1
1206; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1207; GFX9-NEXT:    s_nop 1
1208; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1209; GFX9-NEXT:    s_nop 1
1210; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1211; GFX9-NEXT:    s_nop 1
1212; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1213; GFX9-NEXT:    s_nop 1
1214; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1215; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
1216; GFX9-NEXT:    s_nop 0
1217; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1218; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1219; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1220; GFX9-NEXT:    ; implicit-def: $vgpr0
1221; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1222; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1223; GFX9-NEXT:  ; %bb.1:
1224; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1225; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1226; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1227; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1228; GFX9-NEXT:  .LBB7_2:
1229; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1230; GFX9-NEXT:    s_waitcnt vmcnt(0)
1231; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1232; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1233; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1234; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1235; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1236; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
1237; GFX9-NEXT:    s_endpgm
1238;
1239; GFX10W64-LABEL: sub_i32_varying_vdata:
1240; GFX10W64:       ; %bb.0: ; %entry
1241; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
1242; GFX10W64-NEXT:    s_not_b64 exec, exec
1243; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1244; GFX10W64-NEXT:    s_not_b64 exec, exec
1245; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1246; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1247; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
1248; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1249; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1250; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1251; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
1252; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1253; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1254; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
1255; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
1256; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1257; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
1258; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1259; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
1260; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1261; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1262; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
1263; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
1264; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1265; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1266; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1267; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
1268; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
1269; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
1270; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1271; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1272; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1273; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
1274; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1275; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1276; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1277; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1278; GFX10W64-NEXT:    s_cbranch_execz .LBB7_2
1279; GFX10W64-NEXT:  ; %bb.1:
1280; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1281; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
1282; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1283; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1284; GFX10W64-NEXT:  .LBB7_2:
1285; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1286; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1287; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1288; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
1289; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
1290; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1291; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1292; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
1294; GFX10W64-NEXT:    s_endpgm
1295;
1296; GFX10W32-LABEL: sub_i32_varying_vdata:
1297; GFX10W32:       ; %bb.0: ; %entry
1298; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
1299; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1300; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1301; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1302; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
1303; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1304; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1305; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1306; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1307; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
1308; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1309; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
1310; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1311; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1312; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1313; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
1314; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
1315; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1316; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
1317; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1318; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1319; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1320; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
1321; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1322; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1323; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1324; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1325; GFX10W32-NEXT:    s_cbranch_execz .LBB7_2
1326; GFX10W32-NEXT:  ; %bb.1:
1327; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1328; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
1329; GFX10W32-NEXT:    s_mov_b32 s5, s6
1330; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1331; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1332; GFX10W32-NEXT:  .LBB7_2:
1333; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1334; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1335; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1336; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1337; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1338; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1339; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1340; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1341; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1342; GFX10W32-NEXT:    s_endpgm
1343entry:
1344  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1345  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
1346  store i32 %old, i32 addrspace(1)* %out
1347  ret void
1348}
1349
; Raw buffer atomic sub where the subtracted value is the constant 1 and the
; third intrinsic operand (presumably the per-lane offset, given the test name
; and the `offen` form in the expected code) is the workitem id.
; Every check group below expects one plain `buffer_atomic_sub ... offen glc`
; followed by a store of the returned value, with no mbcnt / exec-masking code
; around it, i.e. the atomic is expected to be emitted unchanged in this case.
1350define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1351; GFX6-LABEL: sub_i32_varying_offset:
1352; GFX6:       ; %bb.0: ; %entry
1353; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1354; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1355; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1356; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1357; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1358; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1359; GFX6-NEXT:    s_mov_b32 s2, -1
1360; GFX6-NEXT:    s_waitcnt vmcnt(0)
1361; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1362; GFX6-NEXT:    s_endpgm
1363;
1364; GFX8-LABEL: sub_i32_varying_offset:
1365; GFX8:       ; %bb.0: ; %entry
1366; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1367; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1368; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1369; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1370; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
1371; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1372; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1373; GFX8-NEXT:    s_waitcnt vmcnt(0)
1374; GFX8-NEXT:    flat_store_dword v[0:1], v2
1375; GFX8-NEXT:    s_endpgm
1376;
1377; GFX9-LABEL: sub_i32_varying_offset:
1378; GFX9:       ; %bb.0: ; %entry
1379; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1380; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1381; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1382; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1383; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1384; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1385; GFX9-NEXT:    s_waitcnt vmcnt(0)
1386; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1387; GFX9-NEXT:    s_endpgm
1388;
1389; GFX10-LABEL: sub_i32_varying_offset:
1390; GFX10:       ; %bb.0: ; %entry
1391; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1392; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1393; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1394; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1395; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1396; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1397; GFX10-NEXT:    s_waitcnt vmcnt(0)
1398; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1399; GFX10-NEXT:    s_endpgm
1400entry:
1401  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1402  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
1403  store i32 %old, i32 addrspace(1)* %out
1404  ret void
1405}
1406