1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7
8declare i32 @llvm.amdgcn.workitem.id.x()
9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg)
10declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32 immarg)
11declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32 immarg)
12
13; Show what the atomic optimization pass will do for raw and struct buffers.
14
; add_i32_constant: raw buffer atomic add of the constant 5 on %inout; the
; value returned by the intrinsic is stored to %out.
;
; In every checked configuration the expected code issues one
; buffer_atomic_add from a block that is only entered by lanes whose mbcnt
; result is 0 (v_cmp_eq_u32, s_and_saveexec, s_cbranch_execz). The operand of
; that atomic is s_bcnt1 of the saved exec mask multiplied by 5. After the
; branch each lane rebuilds its own result with v_mad_u32_u24 as
; mbcnt * 5 + v_readfirstlane(returned value).
; The wave32 run uses the 32-bit forms (exec_lo, s_bcnt1_i32_b32) and has no
; v_mbcnt_hi step.
15define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
16; GFX6-LABEL: add_i32_constant:
17; GFX6:       ; %bb.0: ; %entry
18; GFX6-NEXT:    s_mov_b64 s[2:3], exec
19; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
20; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
21; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
22; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
23; GFX6-NEXT:    ; implicit-def: $vgpr1
24; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
25; GFX6-NEXT:    s_cbranch_execz .LBB0_2
26; GFX6-NEXT:  ; %bb.1:
27; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
28; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
29; GFX6-NEXT:    s_mul_i32 s0, s0, 5
30; GFX6-NEXT:    v_mov_b32_e32 v1, s0
31; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
32; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
33; GFX6-NEXT:  .LBB0_2:
34; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
35; GFX6-NEXT:    s_waitcnt vmcnt(0)
36; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
37; GFX6-NEXT:    s_mov_b32 s7, 0xf000
38; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
39; GFX6-NEXT:    s_mov_b32 s6, -1
40; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
41; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
42; GFX6-NEXT:    s_endpgm
43;
44; GFX8-LABEL: add_i32_constant:
45; GFX8:       ; %bb.0: ; %entry
46; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
47; GFX8-NEXT:    s_mov_b64 s[6:7], exec
48; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
49; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
50; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
51; GFX8-NEXT:    ; implicit-def: $vgpr1
52; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
53; GFX8-NEXT:    s_cbranch_execz .LBB0_2
54; GFX8-NEXT:  ; %bb.1:
55; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
56; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
57; GFX8-NEXT:    s_mul_i32 s0, s0, 5
58; GFX8-NEXT:    v_mov_b32_e32 v1, s0
59; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
60; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
61; GFX8-NEXT:  .LBB0_2:
62; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
63; GFX8-NEXT:    s_waitcnt vmcnt(0)
64; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
65; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
66; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
67; GFX8-NEXT:    v_mov_b32_e32 v0, s2
68; GFX8-NEXT:    v_mov_b32_e32 v1, s3
69; GFX8-NEXT:    flat_store_dword v[0:1], v2
70; GFX8-NEXT:    s_endpgm
71;
72; GFX9-LABEL: add_i32_constant:
73; GFX9:       ; %bb.0: ; %entry
74; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
75; GFX9-NEXT:    s_mov_b64 s[6:7], exec
76; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
77; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
78; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
79; GFX9-NEXT:    ; implicit-def: $vgpr1
80; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
81; GFX9-NEXT:    s_cbranch_execz .LBB0_2
82; GFX9-NEXT:  ; %bb.1:
83; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
84; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
85; GFX9-NEXT:    s_mul_i32 s0, s0, 5
86; GFX9-NEXT:    v_mov_b32_e32 v1, s0
87; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
88; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
89; GFX9-NEXT:  .LBB0_2:
90; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
91; GFX9-NEXT:    s_waitcnt vmcnt(0)
92; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
93; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
94; GFX9-NEXT:    v_mov_b32_e32 v1, 0
95; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
97; GFX9-NEXT:    s_endpgm
98;
99; GFX10W64-LABEL: add_i32_constant:
100; GFX10W64:       ; %bb.0: ; %entry
101; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
102; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
103; GFX10W64-NEXT:    ; implicit-def: $vgpr1
104; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
105; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
106; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
107; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
108; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
109; GFX10W64-NEXT:  ; %bb.1:
110; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
111; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
112; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
113; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
114; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
116; GFX10W64-NEXT:  .LBB0_2:
117; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
118; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
119; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
120; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
121; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
122; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
123; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
124; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
125; GFX10W64-NEXT:    s_endpgm
126;
127; GFX10W32-LABEL: add_i32_constant:
128; GFX10W32:       ; %bb.0: ; %entry
129; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
130; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
131; GFX10W32-NEXT:    ; implicit-def: $vgpr1
132; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
133; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
134; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
135; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
136; GFX10W32-NEXT:  ; %bb.1:
137; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
138; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
139; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
140; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
141; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
142; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
143; GFX10W32-NEXT:  .LBB0_2:
144; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
145; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
146; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
147; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
148; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
149; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
150; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
151; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
152; GFX10W32-NEXT:    s_endpgm
153entry:
154  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
155  store i32 %old, i32 addrspace(1)* %out
156  ret void
157}
158
; add_i32_uniform: raw buffer atomic add whose operand is the kernel argument
; %additive; the value returned by the intrinsic is stored to %out.
;
; The expected code has the same shape as for a constant operand: one
; buffer_atomic_add issued from a block that is only entered by lanes whose
; mbcnt result is 0. Its operand is an s_mul_i32 of a scalar loaded with
; s_load_dword (presumably %additive) and s_bcnt1 of the saved exec mask.
; After the branch each lane multiplies that scalar by its mbcnt value
; (v_mul_lo_u32) and adds the v_readfirstlane of the returned value.
159define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
160; GFX6-LABEL: add_i32_uniform:
161; GFX6:       ; %bb.0: ; %entry
162; GFX6-NEXT:    s_mov_b64 s[2:3], exec
163; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
164; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
165; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
166; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
167; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
168; GFX6-NEXT:    ; implicit-def: $vgpr1
169; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
170; GFX6-NEXT:    s_cbranch_execz .LBB1_2
171; GFX6-NEXT:  ; %bb.1:
172; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
173; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
174; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX6-NEXT:    s_mul_i32 s0, s8, s0
176; GFX6-NEXT:    v_mov_b32_e32 v1, s0
177; GFX6-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
178; GFX6-NEXT:  .LBB1_2:
179; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
180; GFX6-NEXT:    s_waitcnt vmcnt(0)
181; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
182; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
184; GFX6-NEXT:    s_mov_b32 s7, 0xf000
185; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
186; GFX6-NEXT:    s_mov_b32 s6, -1
187; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
188; GFX6-NEXT:    s_endpgm
189;
190; GFX8-LABEL: add_i32_uniform:
191; GFX8:       ; %bb.0: ; %entry
192; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
193; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
194; GFX8-NEXT:    s_mov_b64 s[4:5], exec
195; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
196; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
197; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
198; GFX8-NEXT:    ; implicit-def: $vgpr1
199; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
200; GFX8-NEXT:    s_cbranch_execz .LBB1_2
201; GFX8-NEXT:  ; %bb.1:
202; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
203; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
204; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX8-NEXT:    s_mul_i32 s0, s8, s0
206; GFX8-NEXT:    v_mov_b32_e32 v1, s0
207; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
208; GFX8-NEXT:  .LBB1_2:
209; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
210; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
211; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
212; GFX8-NEXT:    s_waitcnt vmcnt(0)
213; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
214; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
215; GFX8-NEXT:    v_mov_b32_e32 v0, s2
216; GFX8-NEXT:    v_mov_b32_e32 v1, s3
217; GFX8-NEXT:    flat_store_dword v[0:1], v2
218; GFX8-NEXT:    s_endpgm
219;
220; GFX9-LABEL: add_i32_uniform:
221; GFX9:       ; %bb.0: ; %entry
222; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
223; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
224; GFX9-NEXT:    s_mov_b64 s[4:5], exec
225; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
226; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
227; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
228; GFX9-NEXT:    ; implicit-def: $vgpr1
229; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
230; GFX9-NEXT:    s_cbranch_execz .LBB1_2
231; GFX9-NEXT:  ; %bb.1:
232; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
233; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
234; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
235; GFX9-NEXT:    s_mul_i32 s0, s8, s0
236; GFX9-NEXT:    v_mov_b32_e32 v1, s0
237; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
238; GFX9-NEXT:  .LBB1_2:
239; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
240; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
241; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
242; GFX9-NEXT:    s_waitcnt vmcnt(0)
243; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
244; GFX9-NEXT:    v_mov_b32_e32 v1, 0
245; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
246; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
247; GFX9-NEXT:    s_endpgm
248;
249; GFX10W64-LABEL: add_i32_uniform:
250; GFX10W64:       ; %bb.0: ; %entry
251; GFX10W64-NEXT:    s_clause 0x1
252; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
253; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
254; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
255; GFX10W64-NEXT:    ; implicit-def: $vgpr1
256; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
257; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
258; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
259; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
260; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
261; GFX10W64-NEXT:  ; %bb.1:
262; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
263; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
264; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
266; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
267; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
268; GFX10W64-NEXT:  .LBB1_2:
269; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
270; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
271; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
272; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
273; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
274; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
275; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
276; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
277; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
278; GFX10W64-NEXT:    s_endpgm
279;
280; GFX10W32-LABEL: add_i32_uniform:
281; GFX10W32:       ; %bb.0: ; %entry
282; GFX10W32-NEXT:    s_clause 0x1
283; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
284; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
285; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
286; GFX10W32-NEXT:    ; implicit-def: $vgpr1
287; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
288; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
289; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
290; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
291; GFX10W32-NEXT:  ; %bb.1:
292; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
293; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
294; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
295; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
296; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
297; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
298; GFX10W32-NEXT:  .LBB1_2:
299; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
300; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
301; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
303; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
304; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
305; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
306; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
307; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
308; GFX10W32-NEXT:    s_endpgm
309entry:
310  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
311  store i32 %old, i32 addrspace(1)* %out
312  ret void
313}
314
; add_i32_varying_vdata: raw buffer atomic add whose operand is the result of
; llvm.amdgcn.workitem.id.x; the value returned by the intrinsic is stored to
; %out.
;
; On the GFX6 run the checks expect the atomic to be emitted directly: a
; buffer_atomic_add on v0 with no branch or cross-lane code around it.
; On the GFX8, GFX9 and GFX10 runs the checks expect a cross-lane sequence:
;   * the operand is copied and a 0 is written between two s_not of exec
;     (i.e. into the lanes that were not active);
;   * v_add*_dpp steps with row_shr:1/2/4/8 are followed by row_bcast:15/31
;     (GFX8/GFX9) or by v_permlanex16 / v_readlane based steps (GFX10);
;   * the value fed to the single buffer_atomic_add is read with v_readlane
;     from lane 63 (wave64 runs) or lane 31 (wave32 run);
;   * that atomic sits in a block guarded by v_cmp_eq_u32 0 on the mbcnt
;     result, and afterwards each lane adds its shifted per-lane value to the
;     v_readfirstlane of the returned value.
315define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
316; GFX6-LABEL: add_i32_varying_vdata:
317; GFX6:       ; %bb.0: ; %entry
318; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
319; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
320; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
321; GFX6-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
322; GFX6-NEXT:    s_mov_b32 s3, 0xf000
323; GFX6-NEXT:    s_mov_b32 s2, -1
324; GFX6-NEXT:    s_waitcnt vmcnt(0)
325; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
326; GFX6-NEXT:    s_endpgm
327;
328; GFX8-LABEL: add_i32_varying_vdata:
329; GFX8:       ; %bb.0: ; %entry
330; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
331; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
332; GFX8-NEXT:    v_mov_b32_e32 v1, 0
333; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
334; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
335; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
336; GFX8-NEXT:    v_mov_b32_e32 v2, v0
337; GFX8-NEXT:    s_not_b64 exec, exec
338; GFX8-NEXT:    v_mov_b32_e32 v2, 0
339; GFX8-NEXT:    s_not_b64 exec, exec
340; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
341; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
342; GFX8-NEXT:    s_nop 1
343; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
344; GFX8-NEXT:    s_nop 1
345; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
346; GFX8-NEXT:    s_nop 1
347; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
348; GFX8-NEXT:    s_nop 1
349; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
350; GFX8-NEXT:    s_nop 1
351; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
352; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
353; GFX8-NEXT:    s_nop 0
354; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
355; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
356; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
357; GFX8-NEXT:    ; implicit-def: $vgpr0
358; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
359; GFX8-NEXT:    s_cbranch_execz .LBB2_2
360; GFX8-NEXT:  ; %bb.1:
361; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
362; GFX8-NEXT:    v_mov_b32_e32 v0, s6
363; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
364; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
365; GFX8-NEXT:  .LBB2_2:
366; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
367; GFX8-NEXT:    s_waitcnt vmcnt(0)
368; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
369; GFX8-NEXT:    v_mov_b32_e32 v0, v1
370; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
371; GFX8-NEXT:    v_mov_b32_e32 v4, s3
372; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
373; GFX8-NEXT:    v_mov_b32_e32 v3, s2
374; GFX8-NEXT:    flat_store_dword v[3:4], v0
375; GFX8-NEXT:    s_endpgm
376;
377; GFX9-LABEL: add_i32_varying_vdata:
378; GFX9:       ; %bb.0: ; %entry
379; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
380; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
381; GFX9-NEXT:    v_mov_b32_e32 v1, 0
382; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
383; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
384; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
385; GFX9-NEXT:    v_mov_b32_e32 v2, v0
386; GFX9-NEXT:    s_not_b64 exec, exec
387; GFX9-NEXT:    v_mov_b32_e32 v2, 0
388; GFX9-NEXT:    s_not_b64 exec, exec
389; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
390; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
391; GFX9-NEXT:    s_nop 1
392; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
393; GFX9-NEXT:    s_nop 1
394; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
395; GFX9-NEXT:    s_nop 1
396; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
397; GFX9-NEXT:    s_nop 1
398; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
399; GFX9-NEXT:    s_nop 1
400; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
401; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
402; GFX9-NEXT:    s_nop 0
403; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
404; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
405; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
406; GFX9-NEXT:    ; implicit-def: $vgpr0
407; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
408; GFX9-NEXT:    s_cbranch_execz .LBB2_2
409; GFX9-NEXT:  ; %bb.1:
410; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
411; GFX9-NEXT:    v_mov_b32_e32 v0, s6
412; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
413; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
414; GFX9-NEXT:  .LBB2_2:
415; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
416; GFX9-NEXT:    s_waitcnt vmcnt(0)
417; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
418; GFX9-NEXT:    v_mov_b32_e32 v0, v1
419; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
420; GFX9-NEXT:    v_mov_b32_e32 v3, 0
421; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
422; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
423; GFX9-NEXT:    s_endpgm
424;
425; GFX10W64-LABEL: add_i32_varying_vdata:
426; GFX10W64:       ; %bb.0: ; %entry
427; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
428; GFX10W64-NEXT:    s_not_b64 exec, exec
429; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
430; GFX10W64-NEXT:    s_not_b64 exec, exec
431; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
432; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
433; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
434; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
435; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
436; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
437; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
438; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
439; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
440; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
441; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
442; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
443; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
444; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
445; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
446; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
447; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
448; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
449; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
450; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
451; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
452; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
453; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
454; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
455; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
456; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
457; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
458; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
459; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
460; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
461; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
462; GFX10W64-NEXT:    ; implicit-def: $vgpr0
463; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
464; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
465; GFX10W64-NEXT:  ; %bb.1:
466; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
467; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
468; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
469; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
470; GFX10W64-NEXT:  .LBB2_2:
471; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
472; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
473; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
474; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
475; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
476; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
477; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
478; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
480; GFX10W64-NEXT:    s_endpgm
481;
482; GFX10W32-LABEL: add_i32_varying_vdata:
483; GFX10W32:       ; %bb.0: ; %entry
484; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
485; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
486; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
487; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
488; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
489; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
490; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
491; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
492; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
493; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
494; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
495; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
496; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
497; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
498; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
499; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
500; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
501; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
502; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
503; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
504; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
505; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
506; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
507; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
508; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
509; GFX10W32-NEXT:    ; implicit-def: $vgpr0
510; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
511; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
512; GFX10W32-NEXT:  ; %bb.1:
513; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
514; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
515; GFX10W32-NEXT:    s_mov_b32 s5, s6
516; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
517; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
518; GFX10W32-NEXT:  .LBB2_2:
519; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
520; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
521; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
522; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
523; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
524; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
525; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
526; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
527; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
528; GFX10W32-NEXT:    s_endpgm
529entry:
530  %lane = call i32 @llvm.amdgcn.workitem.id.x()
531  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
532  store i32 %old, i32 addrspace(1)* %out
533  ret void
534}
535
536define amdgpu_kernel void @struct_add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %vindex) {
537; GFX6-LABEL: struct_add_i32_varying_vdata:
538; GFX6:       ; %bb.0: ; %entry
539; GFX6-NEXT:    s_load_dword s2, s[0:1], 0x11
540; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
541; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
542; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
543; GFX6-NEXT:    v_mov_b32_e32 v1, s2
544; GFX6-NEXT:    buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
545; GFX6-NEXT:    s_mov_b32 s3, 0xf000
546; GFX6-NEXT:    s_mov_b32 s2, -1
547; GFX6-NEXT:    s_waitcnt vmcnt(0)
548; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
549; GFX6-NEXT:    s_endpgm
550;
551; GFX8-LABEL: struct_add_i32_varying_vdata:
552; GFX8:       ; %bb.0: ; %entry
553; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
554; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
555; GFX8-NEXT:    v_mov_b32_e32 v1, 0
556; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
557; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
558; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
559; GFX8-NEXT:    v_mov_b32_e32 v2, v0
560; GFX8-NEXT:    s_not_b64 exec, exec
561; GFX8-NEXT:    v_mov_b32_e32 v2, 0
562; GFX8-NEXT:    s_not_b64 exec, exec
563; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
564; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
565; GFX8-NEXT:    s_nop 1
566; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
567; GFX8-NEXT:    s_nop 1
568; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
569; GFX8-NEXT:    s_nop 1
570; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
571; GFX8-NEXT:    s_nop 1
572; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
573; GFX8-NEXT:    s_nop 1
574; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
575; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
576; GFX8-NEXT:    s_nop 0
577; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
578; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
579; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
580; GFX8-NEXT:    ; implicit-def: $vgpr0
581; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
582; GFX8-NEXT:    s_cbranch_execz .LBB3_2
583; GFX8-NEXT:  ; %bb.1:
584; GFX8-NEXT:    s_load_dword s7, s[0:1], 0x44
585; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
586; GFX8-NEXT:    v_mov_b32_e32 v0, s6
587; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
588; GFX8-NEXT:    v_mov_b32_e32 v3, s7
589; GFX8-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
590; GFX8-NEXT:  .LBB3_2:
591; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
592; GFX8-NEXT:    s_waitcnt vmcnt(0)
593; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
594; GFX8-NEXT:    v_mov_b32_e32 v0, v1
595; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
596; GFX8-NEXT:    v_mov_b32_e32 v4, s3
597; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
598; GFX8-NEXT:    v_mov_b32_e32 v3, s2
599; GFX8-NEXT:    flat_store_dword v[3:4], v0
600; GFX8-NEXT:    s_endpgm
601;
602; GFX9-LABEL: struct_add_i32_varying_vdata:
603; GFX9:       ; %bb.0: ; %entry
604; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
605; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
606; GFX9-NEXT:    v_mov_b32_e32 v1, 0
607; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
608; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
609; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
610; GFX9-NEXT:    v_mov_b32_e32 v2, v0
611; GFX9-NEXT:    s_not_b64 exec, exec
612; GFX9-NEXT:    v_mov_b32_e32 v2, 0
613; GFX9-NEXT:    s_not_b64 exec, exec
614; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
615; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
616; GFX9-NEXT:    s_nop 1
617; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
618; GFX9-NEXT:    s_nop 1
619; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
620; GFX9-NEXT:    s_nop 1
621; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
622; GFX9-NEXT:    s_nop 1
623; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
624; GFX9-NEXT:    s_nop 1
625; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
626; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
627; GFX9-NEXT:    s_nop 0
628; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
629; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
630; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
631; GFX9-NEXT:    ; implicit-def: $vgpr0
632; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
633; GFX9-NEXT:    s_cbranch_execz .LBB3_2
634; GFX9-NEXT:  ; %bb.1:
635; GFX9-NEXT:    s_load_dword s7, s[0:1], 0x44
636; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
637; GFX9-NEXT:    v_mov_b32_e32 v0, s6
638; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
639; GFX9-NEXT:    v_mov_b32_e32 v3, s7
640; GFX9-NEXT:    buffer_atomic_add v0, v3, s[8:11], 0 idxen glc
641; GFX9-NEXT:  .LBB3_2:
642; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
643; GFX9-NEXT:    s_waitcnt vmcnt(0)
644; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
645; GFX9-NEXT:    v_mov_b32_e32 v0, v1
646; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
647; GFX9-NEXT:    v_mov_b32_e32 v3, 0
648; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
650; GFX9-NEXT:    s_endpgm
651;
652; GFX10W64-LABEL: struct_add_i32_varying_vdata:
653; GFX10W64:       ; %bb.0: ; %entry
654; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
655; GFX10W64-NEXT:    s_not_b64 exec, exec
656; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
657; GFX10W64-NEXT:    s_not_b64 exec, exec
658; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
659; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
660; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
661; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
662; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
663; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
664; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
665; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
666; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
667; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
668; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
669; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
670; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
671; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
672; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
673; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
674; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
675; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
676; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
677; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
678; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
679; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
680; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
681; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
682; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
683; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
684; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
685; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
686; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
687; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
688; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
689; GFX10W64-NEXT:    ; implicit-def: $vgpr0
690; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
691; GFX10W64-NEXT:    s_cbranch_execz .LBB3_2
692; GFX10W64-NEXT:  ; %bb.1:
693; GFX10W64-NEXT:    s_clause 0x1
694; GFX10W64-NEXT:    s_load_dword s7, s[0:1], 0x44
695; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
696; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
697; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX10W64-NEXT:    v_mov_b32_e32 v4, s7
699; GFX10W64-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
700; GFX10W64-NEXT:  .LBB3_2:
701; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
702; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
703; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
704; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
705; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
706; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
707; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
708; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
709; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
710; GFX10W64-NEXT:    s_endpgm
711;
712; GFX10W32-LABEL: struct_add_i32_varying_vdata:
713; GFX10W32:       ; %bb.0: ; %entry
714; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
715; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
716; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
717; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
718; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
719; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
720; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
721; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
722; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
723; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
724; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
725; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
726; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
727; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
728; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
729; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
730; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
731; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
732; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
733; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
734; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
735; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
736; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
737; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
738; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
739; GFX10W32-NEXT:    ; implicit-def: $vgpr0
740; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
741; GFX10W32-NEXT:    s_cbranch_execz .LBB3_2
742; GFX10W32-NEXT:  ; %bb.1:
743; GFX10W32-NEXT:    s_mov_b32 s5, s6
744; GFX10W32-NEXT:    s_clause 0x1
745; GFX10W32-NEXT:    s_load_dword s6, s[0:1], 0x44
746; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
747; GFX10W32-NEXT:    v_mov_b32_e32 v0, s5
748; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
749; GFX10W32-NEXT:    v_mov_b32_e32 v4, s6
750; GFX10W32-NEXT:    buffer_atomic_add v0, v4, s[8:11], 0 idxen glc
751; GFX10W32-NEXT:  .LBB3_2:
752; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
753; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
754; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
755; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
756; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
757; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
758; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
759; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
760; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
761; GFX10W32-NEXT:    s_endpgm
762entry:
763  %lane = call i32 @llvm.amdgcn.workitem.id.x()
764  %old = call i32 @llvm.amdgcn.struct.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 %vindex, i32 0, i32 0, i32 0)
765  store i32 %old, i32 addrspace(1)* %out
766  ret void
767}
768
; Raw buffer atomic add of the constant 1 where the third intrinsic operand is
; the per-lane workitem id. On every run line the expected output is a single
; buffer_atomic_add addressed with `offen` and no mbcnt / exec-mask sequence,
; i.e. the call is emitted as written instead of being combined across lanes.
; Both gfx1010 wave sizes produce the same code, so they share one check prefix.
769define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
770; GFX6-LABEL: add_i32_varying_offset:
771; GFX6:       ; %bb.0: ; %entry
772; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
773; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
774; GFX6-NEXT:    v_mov_b32_e32 v1, 1
775; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
776; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
777; GFX6-NEXT:    s_mov_b32 s3, 0xf000
778; GFX6-NEXT:    s_mov_b32 s2, -1
779; GFX6-NEXT:    s_waitcnt vmcnt(0)
780; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
781; GFX6-NEXT:    s_endpgm
782;
783; GFX8-LABEL: add_i32_varying_offset:
784; GFX8:       ; %bb.0: ; %entry
785; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
786; GFX8-NEXT:    v_mov_b32_e32 v2, 1
787; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
788; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
789; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 offen glc
790; GFX8-NEXT:    v_mov_b32_e32 v0, s0
791; GFX8-NEXT:    v_mov_b32_e32 v1, s1
792; GFX8-NEXT:    s_waitcnt vmcnt(0)
793; GFX8-NEXT:    flat_store_dword v[0:1], v2
794; GFX8-NEXT:    s_endpgm
795;
796; GFX9-LABEL: add_i32_varying_offset:
797; GFX9:       ; %bb.0: ; %entry
798; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
799; GFX9-NEXT:    v_mov_b32_e32 v1, 1
800; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
801; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
802; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
803; GFX9-NEXT:    v_mov_b32_e32 v0, 0
804; GFX9-NEXT:    s_waitcnt vmcnt(0)
805; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
806; GFX9-NEXT:    s_endpgm
807;
808; GFX10-LABEL: add_i32_varying_offset:
809; GFX10:       ; %bb.0: ; %entry
810; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
811; GFX10-NEXT:    v_mov_b32_e32 v1, 1
812; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
813; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
814; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
815; GFX10-NEXT:    v_mov_b32_e32 v0, 0
816; GFX10-NEXT:    s_waitcnt vmcnt(0)
817; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
818; GFX10-NEXT:    s_endpgm
819entry:
820  %lane = call i32 @llvm.amdgcn.workitem.id.x()
821  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
822  store i32 %old, i32 addrspace(1)* %out
823  ret void
824}
825
; Raw buffer atomic sub of the constant 5 with all remaining integer operands
; zero. On every run line the expected output follows the same shape:
;   * v_mbcnt_lo (plus v_mbcnt_hi on the 64-wide runs) counts the active lanes
;     below each lane, and only the lane whose count is 0 enters %bb.1;
;   * that lane issues one buffer_atomic_sub of s_bcnt1(exec copy) * 5;
;   * after exec is restored, the returned value is broadcast with
;     v_readfirstlane and each lane subtracts 5 * its mbcnt count from it
;     before storing to %out.
826define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
827; GFX6-LABEL: sub_i32_constant:
828; GFX6:       ; %bb.0: ; %entry
829; GFX6-NEXT:    s_mov_b64 s[2:3], exec
830; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
831; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
832; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
833; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
834; GFX6-NEXT:    ; implicit-def: $vgpr1
835; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
836; GFX6-NEXT:    s_cbranch_execz .LBB5_2
837; GFX6-NEXT:  ; %bb.1:
838; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
839; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
840; GFX6-NEXT:    s_mul_i32 s0, s0, 5
841; GFX6-NEXT:    v_mov_b32_e32 v1, s0
842; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
843; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
844; GFX6-NEXT:  .LBB5_2:
845; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
846; GFX6-NEXT:    s_waitcnt vmcnt(0)
847; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
848; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
849; GFX6-NEXT:    s_mov_b32 s7, 0xf000
850; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
851; GFX6-NEXT:    s_mov_b32 s6, -1
852; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
853; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
854; GFX6-NEXT:    s_endpgm
855;
856; GFX8-LABEL: sub_i32_constant:
857; GFX8:       ; %bb.0: ; %entry
858; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
859; GFX8-NEXT:    s_mov_b64 s[6:7], exec
860; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
861; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
862; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
863; GFX8-NEXT:    ; implicit-def: $vgpr1
864; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
865; GFX8-NEXT:    s_cbranch_execz .LBB5_2
866; GFX8-NEXT:  ; %bb.1:
867; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
868; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
869; GFX8-NEXT:    s_mul_i32 s0, s0, 5
870; GFX8-NEXT:    v_mov_b32_e32 v1, s0
871; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
872; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
873; GFX8-NEXT:  .LBB5_2:
874; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
875; GFX8-NEXT:    s_waitcnt vmcnt(0)
876; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
877; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
878; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
879; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
880; GFX8-NEXT:    v_mov_b32_e32 v0, s2
881; GFX8-NEXT:    v_mov_b32_e32 v1, s3
882; GFX8-NEXT:    flat_store_dword v[0:1], v2
883; GFX8-NEXT:    s_endpgm
884;
885; GFX9-LABEL: sub_i32_constant:
886; GFX9:       ; %bb.0: ; %entry
887; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
888; GFX9-NEXT:    s_mov_b64 s[6:7], exec
889; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
890; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
891; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
892; GFX9-NEXT:    ; implicit-def: $vgpr1
893; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
894; GFX9-NEXT:    s_cbranch_execz .LBB5_2
895; GFX9-NEXT:  ; %bb.1:
896; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
897; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
898; GFX9-NEXT:    s_mul_i32 s0, s0, 5
899; GFX9-NEXT:    v_mov_b32_e32 v1, s0
900; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
901; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
902; GFX9-NEXT:  .LBB5_2:
903; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
904; GFX9-NEXT:    s_waitcnt vmcnt(0)
905; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
906; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
907; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
908; GFX9-NEXT:    v_mov_b32_e32 v1, 0
909; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
910; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
911; GFX9-NEXT:    s_endpgm
912;
913; GFX10W64-LABEL: sub_i32_constant:
914; GFX10W64:       ; %bb.0: ; %entry
915; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
916; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
917; GFX10W64-NEXT:    ; implicit-def: $vgpr1
918; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
919; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
920; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
921; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
922; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
923; GFX10W64-NEXT:  ; %bb.1:
924; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
925; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
926; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
927; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
928; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
929; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
930; GFX10W64-NEXT:  .LBB5_2:
931; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
932; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
933; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
934; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
935; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
936; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
937; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
938; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
939; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
940; GFX10W64-NEXT:    s_endpgm
941;
942; GFX10W32-LABEL: sub_i32_constant:
943; GFX10W32:       ; %bb.0: ; %entry
944; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
945; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
946; GFX10W32-NEXT:    ; implicit-def: $vgpr1
947; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
948; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
949; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
950; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
951; GFX10W32-NEXT:  ; %bb.1:
952; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
953; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
954; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
955; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
956; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
957; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
958; GFX10W32-NEXT:  .LBB5_2:
959; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
960; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
961; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
962; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
963; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
964; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
965; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
966; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
967; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
968; GFX10W32-NEXT:    s_endpgm
969entry:
970  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
971  store i32 %old, i32 addrspace(1)* %out
972  ret void
973}
974
; Raw buffer atomic sub whose data operand is the kernel argument %subitive
; (the same value for every lane). The expected output mirrors the constant
; case, except the multiplier is the loaded argument instead of an immediate:
;   * the lane whose mbcnt count is 0 issues one buffer_atomic_sub of
;     %subitive * s_bcnt1(exec copy), computed with s_mul_i32;
;   * after exec is restored, each lane computes
;     v_readfirstlane(returned value) - %subitive * its mbcnt count
;     (v_mul_lo_u32 followed by a subtract) and stores that to %out.
975define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
976; GFX6-LABEL: sub_i32_uniform:
977; GFX6:       ; %bb.0: ; %entry
978; GFX6-NEXT:    s_mov_b64 s[2:3], exec
979; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
980; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
981; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
982; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
983; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
984; GFX6-NEXT:    ; implicit-def: $vgpr1
985; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
986; GFX6-NEXT:    s_cbranch_execz .LBB6_2
987; GFX6-NEXT:  ; %bb.1:
988; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
989; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
990; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
991; GFX6-NEXT:    s_mul_i32 s0, s8, s0
992; GFX6-NEXT:    v_mov_b32_e32 v1, s0
993; GFX6-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
994; GFX6-NEXT:  .LBB6_2:
995; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
996; GFX6-NEXT:    s_waitcnt vmcnt(0)
997; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
998; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
999; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
1000; GFX6-NEXT:    s_mov_b32 s7, 0xf000
1001; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
1002; GFX6-NEXT:    s_mov_b32 s6, -1
1003; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1004; GFX6-NEXT:    s_endpgm
1005;
1006; GFX8-LABEL: sub_i32_uniform:
1007; GFX8:       ; %bb.0: ; %entry
1008; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1009; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
1010; GFX8-NEXT:    s_mov_b64 s[4:5], exec
1011; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1012; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1013; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1014; GFX8-NEXT:    ; implicit-def: $vgpr1
1015; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1016; GFX8-NEXT:    s_cbranch_execz .LBB6_2
1017; GFX8-NEXT:  ; %bb.1:
1018; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1019; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1020; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1021; GFX8-NEXT:    s_mul_i32 s0, s8, s0
1022; GFX8-NEXT:    v_mov_b32_e32 v1, s0
1023; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1024; GFX8-NEXT:  .LBB6_2:
1025; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
1026; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1027; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
1028; GFX8-NEXT:    s_waitcnt vmcnt(0)
1029; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
1030; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
1031; GFX8-NEXT:    v_mov_b32_e32 v0, s2
1032; GFX8-NEXT:    v_mov_b32_e32 v1, s3
1033; GFX8-NEXT:    flat_store_dword v[0:1], v2
1034; GFX8-NEXT:    s_endpgm
1035;
1036; GFX9-LABEL: sub_i32_uniform:
1037; GFX9:       ; %bb.0: ; %entry
1038; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1039; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
1040; GFX9-NEXT:    s_mov_b64 s[4:5], exec
1041; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1042; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1043; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1044; GFX9-NEXT:    ; implicit-def: $vgpr1
1045; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1046; GFX9-NEXT:    s_cbranch_execz .LBB6_2
1047; GFX9-NEXT:  ; %bb.1:
1048; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1049; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1050; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1051; GFX9-NEXT:    s_mul_i32 s0, s8, s0
1052; GFX9-NEXT:    v_mov_b32_e32 v1, s0
1053; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1054; GFX9-NEXT:  .LBB6_2:
1055; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
1056; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1057; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
1058; GFX9-NEXT:    s_waitcnt vmcnt(0)
1059; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
1060; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1061; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1062; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
1063; GFX9-NEXT:    s_endpgm
1064;
1065; GFX10W64-LABEL: sub_i32_uniform:
1066; GFX10W64:       ; %bb.0: ; %entry
1067; GFX10W64-NEXT:    s_clause 0x1
1068; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1069; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
1070; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
1071; GFX10W64-NEXT:    ; implicit-def: $vgpr1
1072; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
1073; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
1074; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1075; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
1076; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
1077; GFX10W64-NEXT:  ; %bb.1:
1078; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
1079; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
1080; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1081; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
1082; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
1083; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
1084; GFX10W64-NEXT:  .LBB6_2:
1085; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1086; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
1087; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
1089; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1090; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
1091; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1092; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1093; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
1094; GFX10W64-NEXT:    s_endpgm
1095;
1096; GFX10W32-LABEL: sub_i32_uniform:
1097; GFX10W32:       ; %bb.0: ; %entry
1098; GFX10W32-NEXT:    s_clause 0x1
1099; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1100; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
1101; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
1102; GFX10W32-NEXT:    ; implicit-def: $vgpr1
1103; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
1104; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1105; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
1106; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
1107; GFX10W32-NEXT:  ; %bb.1:
1108; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1109; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
1110; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1111; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
1112; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
1113; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
1114; GFX10W32-NEXT:  .LBB6_2:
1115; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1116; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
1117; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1118; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
1119; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1120; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
1121; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1122; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1123; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
1124; GFX10W32-NEXT:    s_endpgm
1125entry:
1126  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
1127  store i32 %old, i32 addrspace(1)* %out
1128  ret void
1129}
1130
; Raw buffer atomic sub whose data operand is the per-lane workitem id, so the
; value differs between lanes. Expected output per run line:
;   * On the GFX6 run the atomic is emitted directly (one buffer_atomic_sub on
;     v0, no mbcnt or DPP sequence).
;   * On the GFX8 and GFX9 runs the per-lane values are summed across the wave
;     with a chain of DPP adds (row_shr 1/2/4/8, then row_bcast 15 and 31),
;     the total is read from lane 63, and a copy shifted by one lane
;     (wave_shr:1) is kept for the per-lane fix-up.
;   * On the gfx1010 runs the same sum is built with row_shr DPP adds plus
;     v_permlanex16 (and, for the 64-wide run, v_readlane / v_writelane to
;     stitch the rows together); the total is read from lane 63 (64-wide) or
;     lane 31 (32-wide).
;   * In all of the combined forms only the lane whose mbcnt count is 0 issues
;     the buffer_atomic_sub with the wave total, and afterwards each lane
;     subtracts its shifted partial sum from the v_readfirstlane of the
;     returned value before storing to %out.
; Note the cross-lane combination uses adds even though the atomic is a sub.
1131define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
1132; GFX6-LABEL: sub_i32_varying_vdata:
1133; GFX6:       ; %bb.0: ; %entry
1134; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1135; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1136; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1137; GFX6-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
1138; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1139; GFX6-NEXT:    s_mov_b32 s2, -1
1140; GFX6-NEXT:    s_waitcnt vmcnt(0)
1141; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1142; GFX6-NEXT:    s_endpgm
1143;
1144; GFX8-LABEL: sub_i32_varying_vdata:
1145; GFX8:       ; %bb.0: ; %entry
1146; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1147; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1148; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1149; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1150; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1151; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1152; GFX8-NEXT:    v_mov_b32_e32 v2, v0
1153; GFX8-NEXT:    s_not_b64 exec, exec
1154; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1155; GFX8-NEXT:    s_not_b64 exec, exec
1156; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
1157; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1158; GFX8-NEXT:    s_nop 1
1159; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1160; GFX8-NEXT:    s_nop 1
1161; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1162; GFX8-NEXT:    s_nop 1
1163; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1164; GFX8-NEXT:    s_nop 1
1165; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1166; GFX8-NEXT:    s_nop 1
1167; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1168; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
1169; GFX8-NEXT:    s_nop 0
1170; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1171; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
1172; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1173; GFX8-NEXT:    ; implicit-def: $vgpr0
1174; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1175; GFX8-NEXT:    s_cbranch_execz .LBB7_2
1176; GFX8-NEXT:  ; %bb.1:
1177; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1178; GFX8-NEXT:    v_mov_b32_e32 v0, s6
1179; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1180; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1181; GFX8-NEXT:  .LBB7_2:
1182; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
1183; GFX8-NEXT:    s_waitcnt vmcnt(0)
1184; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
1185; GFX8-NEXT:    v_mov_b32_e32 v0, v1
1186; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX8-NEXT:    v_mov_b32_e32 v4, s3
1188; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
1189; GFX8-NEXT:    v_mov_b32_e32 v3, s2
1190; GFX8-NEXT:    flat_store_dword v[3:4], v0
1191; GFX8-NEXT:    s_endpgm
1192;
1193; GFX9-LABEL: sub_i32_varying_vdata:
1194; GFX9:       ; %bb.0: ; %entry
1195; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1196; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1197; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1198; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1199; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
1200; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
1201; GFX9-NEXT:    v_mov_b32_e32 v2, v0
1202; GFX9-NEXT:    s_not_b64 exec, exec
1203; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1204; GFX9-NEXT:    s_not_b64 exec, exec
1205; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
1206; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1207; GFX9-NEXT:    s_nop 1
1208; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1209; GFX9-NEXT:    s_nop 1
1210; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1211; GFX9-NEXT:    s_nop 1
1212; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1213; GFX9-NEXT:    s_nop 1
1214; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
1215; GFX9-NEXT:    s_nop 1
1216; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
1217; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
1218; GFX9-NEXT:    s_nop 0
1219; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
1220; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
1221; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
1222; GFX9-NEXT:    ; implicit-def: $vgpr0
1223; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1224; GFX9-NEXT:    s_cbranch_execz .LBB7_2
1225; GFX9-NEXT:  ; %bb.1:
1226; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1227; GFX9-NEXT:    v_mov_b32_e32 v0, s6
1228; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1229; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1230; GFX9-NEXT:  .LBB7_2:
1231; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
1232; GFX9-NEXT:    s_waitcnt vmcnt(0)
1233; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
1234; GFX9-NEXT:    v_mov_b32_e32 v0, v1
1235; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1236; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1237; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
1239; GFX9-NEXT:    s_endpgm
1240;
1241; GFX10W64-LABEL: sub_i32_varying_vdata:
1242; GFX10W64:       ; %bb.0: ; %entry
1243; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
1244; GFX10W64-NEXT:    s_not_b64 exec, exec
1245; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1246; GFX10W64-NEXT:    s_not_b64 exec, exec
1247; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1248; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1249; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
1250; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1251; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1252; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1253; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
1254; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1255; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1256; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
1257; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
1258; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1259; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
1260; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1261; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
1262; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1263; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1264; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
1265; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
1266; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1267; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1268; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1269; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
1270; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
1271; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
1272; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1273; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1274; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1275; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
1276; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1277; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1278; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1279; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1280; GFX10W64-NEXT:    s_cbranch_execz .LBB7_2
1281; GFX10W64-NEXT:  ; %bb.1:
1282; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1283; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
1284; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1285; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1286; GFX10W64-NEXT:  .LBB7_2:
1287; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1288; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1289; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1290; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
1291; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
1292; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1293; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1294; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1295; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
1296; GFX10W64-NEXT:    s_endpgm
1297;
1298; GFX10W32-LABEL: sub_i32_varying_vdata:
1299; GFX10W32:       ; %bb.0: ; %entry
1300; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
1301; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1302; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1303; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1304; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
1305; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1306; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1307; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1308; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1309; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
1310; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1311; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
1312; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1313; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1314; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1315; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
1316; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
1317; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1318; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
1319; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1320; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1321; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1322; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
1323; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1324; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1325; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1326; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1327; GFX10W32-NEXT:    s_cbranch_execz .LBB7_2
1328; GFX10W32-NEXT:  ; %bb.1:
1329; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1330; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
1331; GFX10W32-NEXT:    s_mov_b32 s5, s6
1332; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1333; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1334; GFX10W32-NEXT:  .LBB7_2:
1335; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1336; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1337; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1338; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1339; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1340; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1341; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1342; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1343; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1344; GFX10W32-NEXT:    s_endpgm
1345entry:
1346  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1347  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
1348  store i32 %old, i32 addrspace(1)* %out
1349  ret void
1350}
1351
; Raw-buffer atomic sub whose data operand is the constant 1, while the
; work-item id is passed as the third intrinsic operand (the offset, going by
; this test's name and the `offen` addressing in the expected output below).
;
; For every target the expected code is a single buffer_atomic_sub followed by
; a store of the returned value; none of the v_mbcnt / s_and_saveexec sequences
; that appear in other tests of this file are expected here.
; TODO confirm - presumably the atomic optimizer leaves this call alone because
; the offset differs per lane; verify against the pass itself.
;
; Both gfx1010 runs (wave32 and wave64) are checked through the shared GFX10
; prefix for this function.
1352define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1353; GFX6-LABEL: sub_i32_varying_offset:
1354; GFX6:       ; %bb.0: ; %entry
1355; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1356; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1357; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1358; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1359; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1360; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1361; GFX6-NEXT:    s_mov_b32 s2, -1
1362; GFX6-NEXT:    s_waitcnt vmcnt(0)
1363; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1364; GFX6-NEXT:    s_endpgm
1365;
1366; GFX8-LABEL: sub_i32_varying_offset:
1367; GFX8:       ; %bb.0: ; %entry
1368; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1369; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1370; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1371; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1372; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
1373; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1374; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1375; GFX8-NEXT:    s_waitcnt vmcnt(0)
1376; GFX8-NEXT:    flat_store_dword v[0:1], v2
1377; GFX8-NEXT:    s_endpgm
1378;
1379; GFX9-LABEL: sub_i32_varying_offset:
1380; GFX9:       ; %bb.0: ; %entry
1381; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1382; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1383; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1384; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1385; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1386; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1387; GFX9-NEXT:    s_waitcnt vmcnt(0)
1388; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1389; GFX9-NEXT:    s_endpgm
1390;
1391; GFX10-LABEL: sub_i32_varying_offset:
1392; GFX10:       ; %bb.0: ; %entry
1393; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1394; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1395; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1396; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1397; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1398; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1399; GFX10-NEXT:    s_waitcnt vmcnt(0)
1400; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1401; GFX10-NEXT:    s_endpgm
1402entry:
  ; Work-item id in x; it is used below as the atomic's third operand.
1403  %lane = call i32 @llvm.amdgcn.workitem.id.x()
  ; Atomic sub of the constant 1 on the buffer described by %inout, with %lane
  ; as the third operand and zero for the remaining two operands.
1404  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
  ; Store the intrinsic's result through %out so the returning (glc) form of
  ; the atomic is what the checks above expect.
1405  store i32 %old, i32 addrspace(1)* %out
1406  ret void
1407}
1408