1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX6 %s
3; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
4; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
5; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W64 %s
6; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX10W32 %s
7
8declare i32 @llvm.amdgcn.workitem.id.x()
9declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32)
10declare i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i32)
11
12; Show what the atomic optimization pass will do for raw buffer atomic intrinsics.
13
; add_i32_constant - raw buffer atomic add of the constant 5 with all offsets 0.
; On every checked target the expected code lets only the lane whose mbcnt is 0
; issue one buffer_atomic_add of 5 * popcount(exec); afterwards each lane
; combines the readfirstlane'd result with 5 * mbcnt (v_mad_u32_u24) and stores
; that value to %out.
14define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
15; GFX6-LABEL: add_i32_constant:
16; GFX6:       ; %bb.0: ; %entry
17; GFX6-NEXT:    s_mov_b64 s[2:3], exec
18; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
19; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
20; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
21; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
22; GFX6-NEXT:    ; implicit-def: $vgpr1
23; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
24; GFX6-NEXT:    s_cbranch_execz .LBB0_2
25; GFX6-NEXT:  ; %bb.1:
26; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
27; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
28; GFX6-NEXT:    s_mul_i32 s0, s0, 5
29; GFX6-NEXT:    v_mov_b32_e32 v1, s0
30; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
31; GFX6-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
32; GFX6-NEXT:  .LBB0_2:
33; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
34; GFX6-NEXT:    s_waitcnt vmcnt(0)
35; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
36; GFX6-NEXT:    s_mov_b32 s7, 0xf000
37; GFX6-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
38; GFX6-NEXT:    s_mov_b32 s6, -1
39; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
41; GFX6-NEXT:    s_endpgm
42;
43; GFX8-LABEL: add_i32_constant:
44; GFX8:       ; %bb.0: ; %entry
45; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
46; GFX8-NEXT:    s_mov_b64 s[6:7], exec
47; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
48; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
49; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
50; GFX8-NEXT:    ; implicit-def: $vgpr1
51; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
52; GFX8-NEXT:    s_cbranch_execz .LBB0_2
53; GFX8-NEXT:  ; %bb.1:
54; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
55; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
56; GFX8-NEXT:    s_mul_i32 s0, s0, 5
57; GFX8-NEXT:    v_mov_b32_e32 v1, s0
58; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
59; GFX8-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
60; GFX8-NEXT:  .LBB0_2:
61; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
62; GFX8-NEXT:    s_waitcnt vmcnt(0)
63; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
64; GFX8-NEXT:    v_mad_u32_u24 v2, v0, 5, s0
65; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX8-NEXT:    v_mov_b32_e32 v0, s2
67; GFX8-NEXT:    v_mov_b32_e32 v1, s3
68; GFX8-NEXT:    flat_store_dword v[0:1], v2
69; GFX8-NEXT:    s_endpgm
70;
71; GFX9-LABEL: add_i32_constant:
72; GFX9:       ; %bb.0: ; %entry
73; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
74; GFX9-NEXT:    s_mov_b64 s[6:7], exec
75; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
76; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
77; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
78; GFX9-NEXT:    ; implicit-def: $vgpr1
79; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
80; GFX9-NEXT:    s_cbranch_execz .LBB0_2
81; GFX9-NEXT:  ; %bb.1:
82; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
83; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
84; GFX9-NEXT:    s_mul_i32 s0, s0, 5
85; GFX9-NEXT:    v_mov_b32_e32 v1, s0
86; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
87; GFX9-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
88; GFX9-NEXT:  .LBB0_2:
89; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
90; GFX9-NEXT:    s_waitcnt vmcnt(0)
91; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
92; GFX9-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
93; GFX9-NEXT:    v_mov_b32_e32 v1, 0
94; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
95; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
96; GFX9-NEXT:    s_endpgm
97;
98; GFX10W64-LABEL: add_i32_constant:
99; GFX10W64:       ; %bb.0: ; %entry
100; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
101; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
102; GFX10W64-NEXT:    ; implicit-def: $vgpr1
103; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
104; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
105; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
106; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
107; GFX10W64-NEXT:    s_cbranch_execz .LBB0_2
108; GFX10W64-NEXT:  ; %bb.1:
109; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
110; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
111; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
112; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
113; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
114; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
115; GFX10W64-NEXT:  .LBB0_2:
116; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
117; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
118; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
119; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
120; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
121; GFX10W64-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
122; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
123; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
124; GFX10W64-NEXT:    s_endpgm
125;
126; GFX10W32-LABEL: add_i32_constant:
127; GFX10W32:       ; %bb.0: ; %entry
128; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
129; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
130; GFX10W32-NEXT:    ; implicit-def: $vgpr1
131; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
132; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
133; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
134; GFX10W32-NEXT:    s_cbranch_execz .LBB0_2
135; GFX10W32-NEXT:  ; %bb.1:
136; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
137; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
138; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
139; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
140; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
141; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
142; GFX10W32-NEXT:  .LBB0_2:
143; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
144; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
145; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
146; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
147; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
148; GFX10W32-NEXT:    v_mad_u32_u24 v0, v0, 5, s0
149; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
150; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
151; GFX10W32-NEXT:    s_endpgm
152entry:
  ; The value added (first operand) is the literal 5; the three trailing i32
  ; operands are all 0.
153  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
154  store i32 %old, i32 addrspace(1)* %out
155  ret void
156}
157
; add_i32_uniform - raw buffer atomic add of the kernel argument %additive,
; which is loaded into an SGPR (s_load_dword) in the checked output.
; The expected code lets only the lane whose mbcnt is 0 issue one
; buffer_atomic_add of %additive * popcount(exec); each lane then adds
; %additive * mbcnt to the readfirstlane'd result (v_mul_lo_u32 + add on the
; GFX6/8/9 runs, a single v_mad_u64_u32 on the gfx1010 runs).
158define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %additive) {
159; GFX6-LABEL: add_i32_uniform:
160; GFX6:       ; %bb.0: ; %entry
161; GFX6-NEXT:    s_mov_b64 s[2:3], exec
162; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
163; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
164; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
165; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
166; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
167; GFX6-NEXT:    ; implicit-def: $vgpr1
168; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
169; GFX6-NEXT:    s_cbranch_execz .LBB1_2
170; GFX6-NEXT:  ; %bb.1:
171; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
172; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
173; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
174; GFX6-NEXT:    s_mul_i32 s0, s8, s0
175; GFX6-NEXT:    v_mov_b32_e32 v1, s0
176; GFX6-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
177; GFX6-NEXT:  .LBB1_2:
178; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
179; GFX6-NEXT:    s_waitcnt vmcnt(0)
180; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
181; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
183; GFX6-NEXT:    s_mov_b32 s7, 0xf000
184; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
185; GFX6-NEXT:    s_mov_b32 s6, -1
186; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
187; GFX6-NEXT:    s_endpgm
188;
189; GFX8-LABEL: add_i32_uniform:
190; GFX8:       ; %bb.0: ; %entry
191; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
192; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
193; GFX8-NEXT:    s_mov_b64 s[4:5], exec
194; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
195; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
196; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
197; GFX8-NEXT:    ; implicit-def: $vgpr1
198; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
199; GFX8-NEXT:    s_cbranch_execz .LBB1_2
200; GFX8-NEXT:  ; %bb.1:
201; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
202; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
203; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
204; GFX8-NEXT:    s_mul_i32 s0, s8, s0
205; GFX8-NEXT:    v_mov_b32_e32 v1, s0
206; GFX8-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
207; GFX8-NEXT:  .LBB1_2:
208; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
209; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
211; GFX8-NEXT:    s_waitcnt vmcnt(0)
212; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
213; GFX8-NEXT:    v_add_u32_e32 v2, vcc, s0, v0
214; GFX8-NEXT:    v_mov_b32_e32 v0, s2
215; GFX8-NEXT:    v_mov_b32_e32 v1, s3
216; GFX8-NEXT:    flat_store_dword v[0:1], v2
217; GFX8-NEXT:    s_endpgm
218;
219; GFX9-LABEL: add_i32_uniform:
220; GFX9:       ; %bb.0: ; %entry
221; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
222; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
223; GFX9-NEXT:    s_mov_b64 s[4:5], exec
224; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
225; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
226; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
227; GFX9-NEXT:    ; implicit-def: $vgpr1
228; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
229; GFX9-NEXT:    s_cbranch_execz .LBB1_2
230; GFX9-NEXT:  ; %bb.1:
231; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
232; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
233; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
234; GFX9-NEXT:    s_mul_i32 s0, s8, s0
235; GFX9-NEXT:    v_mov_b32_e32 v1, s0
236; GFX9-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
237; GFX9-NEXT:  .LBB1_2:
238; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
239; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
240; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
241; GFX9-NEXT:    s_waitcnt vmcnt(0)
242; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
243; GFX9-NEXT:    v_mov_b32_e32 v1, 0
244; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
245; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
246; GFX9-NEXT:    s_endpgm
247;
248; GFX10W64-LABEL: add_i32_uniform:
249; GFX10W64:       ; %bb.0: ; %entry
250; GFX10W64-NEXT:    s_clause 0x1
251; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
252; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
253; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
254; GFX10W64-NEXT:    ; implicit-def: $vgpr1
255; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
256; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
257; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
258; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
259; GFX10W64-NEXT:    s_cbranch_execz .LBB1_2
260; GFX10W64-NEXT:  ; %bb.1:
261; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
262; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
263; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
264; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
265; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
266; GFX10W64-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 glc
267; GFX10W64-NEXT:  .LBB1_2:
268; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
269; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
270; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
271; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
272; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
273; GFX10W64-NEXT:    v_mad_u64_u32 v[0:1], null, s8, v0, s[0:1]
274; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
275; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
276; GFX10W64-NEXT:    s_endpgm
277;
278; GFX10W32-LABEL: add_i32_uniform:
279; GFX10W32:       ; %bb.0: ; %entry
280; GFX10W32-NEXT:    s_clause 0x1
281; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
282; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
283; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
284; GFX10W32-NEXT:    ; implicit-def: $vgpr1
285; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
286; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
287; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
288; GFX10W32-NEXT:    s_cbranch_execz .LBB1_2
289; GFX10W32-NEXT:  ; %bb.1:
290; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
291; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
292; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
293; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
294; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
295; GFX10W32-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 glc
296; GFX10W32-NEXT:  .LBB1_2:
297; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
298; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
299; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
300; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
301; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX10W32-NEXT:    v_mad_u64_u32 v[0:1], null, s4, v0, s[0:1]
303; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
304; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
305; GFX10W32-NEXT:    s_endpgm
306entry:
  ; The value added (first operand) is the kernel argument %additive; the three
  ; trailing i32 operands are all 0.
307  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %additive, <4 x i32> %inout, i32 0, i32 0, i32 0)
308  store i32 %old, i32 addrspace(1)* %out
309  ret void
310}
311
; add_i32_varying_vdata - raw buffer atomic add where the value added is the
; workitem id, so it differs per lane.
; The GFX6 run expects the atomic unchanged (one plain buffer_atomic_add on v0).
; The tonga, gfx900 and gfx1010 runs expect a DPP-based cross-lane add sequence
; (v_add_*_dpp with row_shr / row_bcast or v_permlanex16), the wave total read
; with v_readlane, one buffer_atomic_add of that total issued by the lane whose
; mbcnt is 0, and a final per-lane add of the readfirstlane'd result to the
; lane-shifted scan value.
312define amdgpu_kernel void @add_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
313; GFX6-LABEL: add_i32_varying_vdata:
314; GFX6:       ; %bb.0: ; %entry
315; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
316; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
317; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
318; GFX6-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
319; GFX6-NEXT:    s_mov_b32 s3, 0xf000
320; GFX6-NEXT:    s_mov_b32 s2, -1
321; GFX6-NEXT:    s_waitcnt vmcnt(0)
322; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
323; GFX6-NEXT:    s_endpgm
324;
325; GFX8-LABEL: add_i32_varying_vdata:
326; GFX8:       ; %bb.0: ; %entry
327; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
328; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
329; GFX8-NEXT:    v_mov_b32_e32 v1, 0
330; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
331; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
332; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
333; GFX8-NEXT:    v_mov_b32_e32 v2, v0
334; GFX8-NEXT:    s_not_b64 exec, exec
335; GFX8-NEXT:    v_mov_b32_e32 v2, 0
336; GFX8-NEXT:    s_not_b64 exec, exec
337; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
338; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
339; GFX8-NEXT:    s_nop 1
340; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
341; GFX8-NEXT:    s_nop 1
342; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
343; GFX8-NEXT:    s_nop 1
344; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
345; GFX8-NEXT:    s_nop 1
346; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
347; GFX8-NEXT:    s_nop 1
348; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
349; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
350; GFX8-NEXT:    s_nop 0
351; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
352; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
353; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
354; GFX8-NEXT:    ; implicit-def: $vgpr0
355; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
356; GFX8-NEXT:    s_cbranch_execz .LBB2_2
357; GFX8-NEXT:  ; %bb.1:
358; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
359; GFX8-NEXT:    v_mov_b32_e32 v0, s6
360; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
361; GFX8-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
362; GFX8-NEXT:  .LBB2_2:
363; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
364; GFX8-NEXT:    s_waitcnt vmcnt(0)
365; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
366; GFX8-NEXT:    v_mov_b32_e32 v0, v1
367; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX8-NEXT:    v_mov_b32_e32 v4, s3
369; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
370; GFX8-NEXT:    v_mov_b32_e32 v3, s2
371; GFX8-NEXT:    flat_store_dword v[3:4], v0
372; GFX8-NEXT:    s_endpgm
373;
374; GFX9-LABEL: add_i32_varying_vdata:
375; GFX9:       ; %bb.0: ; %entry
376; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
377; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
378; GFX9-NEXT:    v_mov_b32_e32 v1, 0
379; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
380; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
381; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
382; GFX9-NEXT:    v_mov_b32_e32 v2, v0
383; GFX9-NEXT:    s_not_b64 exec, exec
384; GFX9-NEXT:    v_mov_b32_e32 v2, 0
385; GFX9-NEXT:    s_not_b64 exec, exec
386; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
387; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
388; GFX9-NEXT:    s_nop 1
389; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
390; GFX9-NEXT:    s_nop 1
391; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
392; GFX9-NEXT:    s_nop 1
393; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
394; GFX9-NEXT:    s_nop 1
395; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
396; GFX9-NEXT:    s_nop 1
397; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
398; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
399; GFX9-NEXT:    s_nop 0
400; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
401; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
402; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
403; GFX9-NEXT:    ; implicit-def: $vgpr0
404; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
405; GFX9-NEXT:    s_cbranch_execz .LBB2_2
406; GFX9-NEXT:  ; %bb.1:
407; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
408; GFX9-NEXT:    v_mov_b32_e32 v0, s6
409; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX9-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
411; GFX9-NEXT:  .LBB2_2:
412; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
413; GFX9-NEXT:    s_waitcnt vmcnt(0)
414; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
415; GFX9-NEXT:    v_mov_b32_e32 v0, v1
416; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
417; GFX9-NEXT:    v_mov_b32_e32 v3, 0
418; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
419; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
420; GFX9-NEXT:    s_endpgm
421;
422; GFX10W64-LABEL: add_i32_varying_vdata:
423; GFX10W64:       ; %bb.0: ; %entry
424; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
425; GFX10W64-NEXT:    s_not_b64 exec, exec
426; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
427; GFX10W64-NEXT:    s_not_b64 exec, exec
428; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
429; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
430; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
431; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
432; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
433; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
434; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
435; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
436; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
437; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
438; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
439; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
440; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
441; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
442; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
443; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
444; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
445; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
446; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
447; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
448; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
449; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
450; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
451; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
452; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
453; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
454; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
455; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
456; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
457; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
458; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
459; GFX10W64-NEXT:    ; implicit-def: $vgpr0
460; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
461; GFX10W64-NEXT:    s_cbranch_execz .LBB2_2
462; GFX10W64-NEXT:  ; %bb.1:
463; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
464; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
465; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
466; GFX10W64-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
467; GFX10W64-NEXT:  .LBB2_2:
468; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
469; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
470; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
471; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
472; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
473; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
474; GFX10W64-NEXT:    v_add_nc_u32_e32 v0, s0, v0
475; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
476; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
477; GFX10W64-NEXT:    s_endpgm
478;
479; GFX10W32-LABEL: add_i32_varying_vdata:
480; GFX10W32:       ; %bb.0: ; %entry
481; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
482; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
483; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
484; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
485; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
486; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
487; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
488; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
489; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
490; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
491; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
492; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
493; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
494; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
495; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
496; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
497; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
498; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
499; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
500; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
501; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
502; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
503; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
504; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
505; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
506; GFX10W32-NEXT:    ; implicit-def: $vgpr0
507; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
508; GFX10W32-NEXT:    s_cbranch_execz .LBB2_2
509; GFX10W32-NEXT:  ; %bb.1:
510; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
511; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
512; GFX10W32-NEXT:    s_mov_b32 s5, s6
513; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
514; GFX10W32-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 glc
515; GFX10W32-NEXT:  .LBB2_2:
516; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
517; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
518; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
519; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
520; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
521; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
522; GFX10W32-NEXT:    v_add_nc_u32_e32 v0, s0, v0
523; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
525; GFX10W32-NEXT:    s_endpgm
526entry:
  ; %lane (the workitem id) is passed as the value to add, making the data
  ; operand divergent across the wave.
527  %lane = call i32 @llvm.amdgcn.workitem.id.x()
528  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
529  store i32 %old, i32 addrspace(1)* %out
530  ret void
531}
532
; add_i32_varying_offset - raw buffer atomic add of the constant 1 where the
; third intrinsic operand (emitted as the "offen" VGPR offset in the checks) is
; the workitem id, so it differs per lane.
; Every run expects the atomic to be left as-is: one buffer_atomic_add per
; lane with no mbcnt / readfirstlane sequence. Both gfx1010 runs produce the
; same code here and therefore share the common GFX10 check prefix.
533define amdgpu_kernel void @add_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
534; GFX6-LABEL: add_i32_varying_offset:
535; GFX6:       ; %bb.0: ; %entry
536; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
537; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
538; GFX6-NEXT:    v_mov_b32_e32 v1, 1
539; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
540; GFX6-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
541; GFX6-NEXT:    s_mov_b32 s3, 0xf000
542; GFX6-NEXT:    s_mov_b32 s2, -1
543; GFX6-NEXT:    s_waitcnt vmcnt(0)
544; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
545; GFX6-NEXT:    s_endpgm
546;
547; GFX8-LABEL: add_i32_varying_offset:
548; GFX8:       ; %bb.0: ; %entry
549; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
550; GFX8-NEXT:    v_mov_b32_e32 v2, 1
551; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
552; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
553; GFX8-NEXT:    buffer_atomic_add v2, v0, s[4:7], 0 offen glc
554; GFX8-NEXT:    v_mov_b32_e32 v0, s0
555; GFX8-NEXT:    v_mov_b32_e32 v1, s1
556; GFX8-NEXT:    s_waitcnt vmcnt(0)
557; GFX8-NEXT:    flat_store_dword v[0:1], v2
558; GFX8-NEXT:    s_endpgm
559;
560; GFX9-LABEL: add_i32_varying_offset:
561; GFX9:       ; %bb.0: ; %entry
562; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
563; GFX9-NEXT:    v_mov_b32_e32 v1, 1
564; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
565; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX9-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
567; GFX9-NEXT:    v_mov_b32_e32 v0, 0
568; GFX9-NEXT:    s_waitcnt vmcnt(0)
569; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
570; GFX9-NEXT:    s_endpgm
571;
572; GFX10-LABEL: add_i32_varying_offset:
573; GFX10:       ; %bb.0: ; %entry
574; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
575; GFX10-NEXT:    v_mov_b32_e32 v1, 1
576; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
577; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
578; GFX10-NEXT:    buffer_atomic_add v1, v0, s[4:7], 0 offen glc
579; GFX10-NEXT:    v_mov_b32_e32 v0, 0
580; GFX10-NEXT:    s_waitcnt vmcnt(0)
581; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
582; GFX10-NEXT:    s_endpgm
583entry:
  ; The value added is the constant 1; %lane (the workitem id) is passed as the
  ; third operand instead of 0.
584  %lane = call i32 @llvm.amdgcn.workitem.id.x()
585  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.add(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
586  store i32 %old, i32 addrspace(1)* %out
587  ret void
588}
589
; sub_i32_constant - raw buffer atomic sub of the constant 5 with all offsets 0.
; On every checked target the expected code lets only the lane whose mbcnt is 0
; issue one buffer_atomic_sub of 5 * popcount(exec); afterwards each lane
; subtracts 5 * mbcnt (v_mul_u32_u24 followed by a v_sub) from the
; readfirstlane'd result and stores that value to %out.
590define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) {
591; GFX6-LABEL: sub_i32_constant:
592; GFX6:       ; %bb.0: ; %entry
593; GFX6-NEXT:    s_mov_b64 s[2:3], exec
594; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
595; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
596; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
597; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
598; GFX6-NEXT:    ; implicit-def: $vgpr1
599; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
600; GFX6-NEXT:    s_cbranch_execz .LBB4_2
601; GFX6-NEXT:  ; %bb.1:
602; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
603; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
604; GFX6-NEXT:    s_mul_i32 s0, s0, 5
605; GFX6-NEXT:    v_mov_b32_e32 v1, s0
606; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
607; GFX6-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
608; GFX6-NEXT:  .LBB4_2:
609; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
610; GFX6-NEXT:    s_waitcnt vmcnt(0)
611; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
612; GFX6-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
613; GFX6-NEXT:    s_mov_b32 s7, 0xf000
614; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
615; GFX6-NEXT:    s_mov_b32 s6, -1
616; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
617; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
618; GFX6-NEXT:    s_endpgm
619;
620; GFX8-LABEL: sub_i32_constant:
621; GFX8:       ; %bb.0: ; %entry
622; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
623; GFX8-NEXT:    s_mov_b64 s[6:7], exec
624; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
625; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
626; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
627; GFX8-NEXT:    ; implicit-def: $vgpr1
628; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
629; GFX8-NEXT:    s_cbranch_execz .LBB4_2
630; GFX8-NEXT:  ; %bb.1:
631; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
632; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
633; GFX8-NEXT:    s_mul_i32 s0, s0, 5
634; GFX8-NEXT:    v_mov_b32_e32 v1, s0
635; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
636; GFX8-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
637; GFX8-NEXT:  .LBB4_2:
638; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
639; GFX8-NEXT:    s_waitcnt vmcnt(0)
640; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
641; GFX8-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
642; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
643; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
644; GFX8-NEXT:    v_mov_b32_e32 v0, s2
645; GFX8-NEXT:    v_mov_b32_e32 v1, s3
646; GFX8-NEXT:    flat_store_dword v[0:1], v2
647; GFX8-NEXT:    s_endpgm
648;
649; GFX9-LABEL: sub_i32_constant:
650; GFX9:       ; %bb.0: ; %entry
651; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
652; GFX9-NEXT:    s_mov_b64 s[6:7], exec
653; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
654; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
655; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
656; GFX9-NEXT:    ; implicit-def: $vgpr1
657; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
658; GFX9-NEXT:    s_cbranch_execz .LBB4_2
659; GFX9-NEXT:  ; %bb.1:
660; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
661; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
662; GFX9-NEXT:    s_mul_i32 s0, s0, 5
663; GFX9-NEXT:    v_mov_b32_e32 v1, s0
664; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
665; GFX9-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
666; GFX9-NEXT:  .LBB4_2:
667; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
668; GFX9-NEXT:    s_waitcnt vmcnt(0)
669; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
670; GFX9-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
671; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
672; GFX9-NEXT:    v_mov_b32_e32 v1, 0
673; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
674; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
675; GFX9-NEXT:    s_endpgm
676;
677; GFX10W64-LABEL: sub_i32_constant:
678; GFX10W64:       ; %bb.0: ; %entry
679; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
680; GFX10W64-NEXT:    s_mov_b64 s[6:7], exec
681; GFX10W64-NEXT:    ; implicit-def: $vgpr1
682; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
683; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
684; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
685; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
686; GFX10W64-NEXT:    s_cbranch_execz .LBB4_2
687; GFX10W64-NEXT:  ; %bb.1:
688; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
689; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[6:7]
690; GFX10W64-NEXT:    s_mul_i32 s0, s0, 5
691; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
692; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
693; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
694; GFX10W64-NEXT:  .LBB4_2:
695; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
696; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
697; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
698; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
699; GFX10W64-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
700; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
701; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
702; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
703; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
704; GFX10W64-NEXT:    s_endpgm
705;
706; GFX10W32-LABEL: sub_i32_constant:
707; GFX10W32:       ; %bb.0: ; %entry
708; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
709; GFX10W32-NEXT:    s_mov_b32 s5, exec_lo
710; GFX10W32-NEXT:    ; implicit-def: $vgpr1
711; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
712; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
713; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
714; GFX10W32-NEXT:    s_cbranch_execz .LBB4_2
715; GFX10W32-NEXT:  ; %bb.1:
716; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
717; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s5
718; GFX10W32-NEXT:    s_mul_i32 s0, s0, 5
719; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
720; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
721; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
722; GFX10W32-NEXT:  .LBB4_2:
723; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
724; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
725; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
726; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
727; GFX10W32-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
728; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
729; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
730; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
731; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
732; GFX10W32-NEXT:    s_endpgm
733entry:
  ; The value subtracted (first operand) is the literal 5; the three trailing
  ; i32 operands are all 0.
734  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 5, <4 x i32> %inout, i32 0, i32 0, i32 0)
735  store i32 %old, i32 addrspace(1)* %out
736  ret void
737}
738
; sub_i32_uniform: raw buffer atomic sub whose data operand is the kernel
; argument %subitive; the value returned by the intrinsic is stored to %out.
; For every check prefix below the expected code:
;   * uses v_mbcnt on a saved copy of exec and lets only the lane whose count
;     is 0 enter the block that issues the atomic,
;   * feeds that single buffer_atomic_sub with %subitive multiplied by the
;     bit count of the saved exec mask (s_bcnt1 followed by s_mul_i32),
;   * after rejoining, computes readfirstlane(atomic result) minus
;     (%subitive * mbcnt value) in each lane and stores that.
739define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, <4 x i32> %inout, i32 %subitive) {
740; GFX6-LABEL: sub_i32_uniform:
741; GFX6:       ; %bb.0: ; %entry
742; GFX6-NEXT:    s_mov_b64 s[2:3], exec
743; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
744; GFX6-NEXT:    s_load_dword s8, s[0:1], 0x11
745; GFX6-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, s2, 0
746; GFX6-NEXT:    v_mbcnt_hi_u32_b32_e32 v0, s3, v0
747; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
748; GFX6-NEXT:    ; implicit-def: $vgpr1
749; GFX6-NEXT:    s_and_saveexec_b64 s[6:7], vcc
750; GFX6-NEXT:    s_cbranch_execz .LBB5_2
751; GFX6-NEXT:  ; %bb.1:
752; GFX6-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0xd
753; GFX6-NEXT:    s_bcnt1_i32_b64 s0, s[2:3]
754; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
755; GFX6-NEXT:    s_mul_i32 s0, s8, s0
756; GFX6-NEXT:    v_mov_b32_e32 v1, s0
757; GFX6-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
758; GFX6-NEXT:  .LBB5_2:
759; GFX6-NEXT:    s_or_b64 exec, exec, s[6:7]
760; GFX6-NEXT:    s_waitcnt vmcnt(0)
761; GFX6-NEXT:    v_readfirstlane_b32 s0, v1
762; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
763; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
764; GFX6-NEXT:    s_mov_b32 s7, 0xf000
765; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
766; GFX6-NEXT:    s_mov_b32 s6, -1
767; GFX6-NEXT:    buffer_store_dword v0, off, s[4:7], 0
768; GFX6-NEXT:    s_endpgm
769;
770; GFX8-LABEL: sub_i32_uniform:
771; GFX8:       ; %bb.0: ; %entry
772; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
773; GFX8-NEXT:    s_load_dword s8, s[0:1], 0x44
774; GFX8-NEXT:    s_mov_b64 s[4:5], exec
775; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
776; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
777; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
778; GFX8-NEXT:    ; implicit-def: $vgpr1
779; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
780; GFX8-NEXT:    s_cbranch_execz .LBB5_2
781; GFX8-NEXT:  ; %bb.1:
782; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
783; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
784; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
785; GFX8-NEXT:    s_mul_i32 s0, s8, s0
786; GFX8-NEXT:    v_mov_b32_e32 v1, s0
787; GFX8-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
788; GFX8-NEXT:  .LBB5_2:
789; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
790; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
791; GFX8-NEXT:    v_mul_lo_u32 v0, s8, v0
792; GFX8-NEXT:    s_waitcnt vmcnt(0)
793; GFX8-NEXT:    v_readfirstlane_b32 s0, v1
794; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s0, v0
795; GFX8-NEXT:    v_mov_b32_e32 v0, s2
796; GFX8-NEXT:    v_mov_b32_e32 v1, s3
797; GFX8-NEXT:    flat_store_dword v[0:1], v2
798; GFX8-NEXT:    s_endpgm
799;
800; GFX9-LABEL: sub_i32_uniform:
801; GFX9:       ; %bb.0: ; %entry
802; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
803; GFX9-NEXT:    s_load_dword s8, s[0:1], 0x44
804; GFX9-NEXT:    s_mov_b64 s[4:5], exec
805; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
806; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
807; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
808; GFX9-NEXT:    ; implicit-def: $vgpr1
809; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
810; GFX9-NEXT:    s_cbranch_execz .LBB5_2
811; GFX9-NEXT:  ; %bb.1:
812; GFX9-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
813; GFX9-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
814; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
815; GFX9-NEXT:    s_mul_i32 s0, s8, s0
816; GFX9-NEXT:    v_mov_b32_e32 v1, s0
817; GFX9-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
818; GFX9-NEXT:  .LBB5_2:
819; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
820; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
821; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v0
822; GFX9-NEXT:    s_waitcnt vmcnt(0)
823; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
824; GFX9-NEXT:    v_mov_b32_e32 v1, 0
825; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
826; GFX9-NEXT:    global_store_dword v1, v0, s[2:3]
827; GFX9-NEXT:    s_endpgm
828;
829; GFX10W64-LABEL: sub_i32_uniform:
830; GFX10W64:       ; %bb.0: ; %entry
831; GFX10W64-NEXT:    s_clause 0x1
832; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
833; GFX10W64-NEXT:    s_load_dword s8, s[0:1], 0x44
834; GFX10W64-NEXT:    s_mov_b64 s[4:5], exec
835; GFX10W64-NEXT:    ; implicit-def: $vgpr1
836; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
837; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
838; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
839; GFX10W64-NEXT:    s_and_saveexec_b64 s[6:7], vcc
840; GFX10W64-NEXT:    s_cbranch_execz .LBB5_2
841; GFX10W64-NEXT:  ; %bb.1:
842; GFX10W64-NEXT:    s_load_dwordx4 s[12:15], s[0:1], 0x34
843; GFX10W64-NEXT:    s_bcnt1_i32_b64 s0, s[4:5]
844; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
845; GFX10W64-NEXT:    s_mul_i32 s0, s8, s0
846; GFX10W64-NEXT:    v_mov_b32_e32 v1, s0
847; GFX10W64-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 glc
848; GFX10W64-NEXT:  .LBB5_2:
849; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
850; GFX10W64-NEXT:    s_or_b64 exec, exec, s[6:7]
851; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
852; GFX10W64-NEXT:    v_mul_lo_u32 v0, s8, v0
853; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
854; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v1
855; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
856; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
857; GFX10W64-NEXT:    global_store_dword v1, v0, s[2:3]
858; GFX10W64-NEXT:    s_endpgm
859;
860; GFX10W32-LABEL: sub_i32_uniform:
861; GFX10W32:       ; %bb.0: ; %entry
862; GFX10W32-NEXT:    s_clause 0x1
863; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
864; GFX10W32-NEXT:    s_load_dword s4, s[0:1], 0x44
865; GFX10W32-NEXT:    s_mov_b32 s6, exec_lo
866; GFX10W32-NEXT:    ; implicit-def: $vgpr1
867; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
868; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
869; GFX10W32-NEXT:    s_and_saveexec_b32 s5, vcc_lo
870; GFX10W32-NEXT:    s_cbranch_execz .LBB5_2
871; GFX10W32-NEXT:  ; %bb.1:
872; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
873; GFX10W32-NEXT:    s_bcnt1_i32_b32 s0, s6
874; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
875; GFX10W32-NEXT:    s_mul_i32 s0, s4, s0
876; GFX10W32-NEXT:    v_mov_b32_e32 v1, s0
877; GFX10W32-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 glc
878; GFX10W32-NEXT:  .LBB5_2:
879; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
880; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s5
881; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
882; GFX10W32-NEXT:    v_mul_lo_u32 v0, s4, v0
883; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
884; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v1
885; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
886; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
887; GFX10W32-NEXT:    global_store_dword v1, v0, s[2:3]
888; GFX10W32-NEXT:    s_endpgm
; IR under test: one atomic sub of %subitive with the remaining i32 operands
; all 0, followed by a store of the returned value.
889entry:
890  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %subitive, <4 x i32> %inout, i32 0, i32 0, i32 0)
891  store i32 %old, i32 addrspace(1)* %out
892  ret void
893}
894
; sub_i32_varying_vdata: raw buffer atomic sub whose data operand is the
; result of llvm.amdgcn.workitem.id.x; the returned value is stored to %out.
; Expected code per target, as pinned by the checks below:
;   * GFX6 checks expect a plain buffer_atomic_sub on v0 with no mbcnt or
;     saveexec sequence around it.
;   * GFX8 and GFX9 checks expect a chain of v_add_u32_dpp steps (row_shr 1,
;     2, 4, 8, then row_bcast 15 and 31) run with exec forced to -1, a
;     v_readlane of lane 63 as the operand of a single buffer_atomic_sub
;     issued by the lane whose mbcnt value is 0, and a final
;     readfirstlane(result) minus the wave_shr:1-shifted scan value.
;   * GFX10 wave64 and wave32 checks expect the same shape built from
;     v_add_nc_u32_dpp, v_permlanex16_b32 and v_readlane/v_writelane; the
;     atomic operand is read from lane 63 (wave64) or lane 31 (wave32).
895define amdgpu_kernel void @sub_i32_varying_vdata(i32 addrspace(1)* %out, <4 x i32> %inout) {
896; GFX6-LABEL: sub_i32_varying_vdata:
897; GFX6:       ; %bb.0: ; %entry
898; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
899; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
900; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
901; GFX6-NEXT:    buffer_atomic_sub v0, off, s[4:7], 0 glc
902; GFX6-NEXT:    s_mov_b32 s3, 0xf000
903; GFX6-NEXT:    s_mov_b32 s2, -1
904; GFX6-NEXT:    s_waitcnt vmcnt(0)
905; GFX6-NEXT:    buffer_store_dword v0, off, s[0:3], 0
906; GFX6-NEXT:    s_endpgm
907;
908; GFX8-LABEL: sub_i32_varying_vdata:
909; GFX8:       ; %bb.0: ; %entry
910; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
911; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
912; GFX8-NEXT:    v_mov_b32_e32 v1, 0
913; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
914; GFX8-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
915; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
916; GFX8-NEXT:    v_mov_b32_e32 v2, v0
917; GFX8-NEXT:    s_not_b64 exec, exec
918; GFX8-NEXT:    v_mov_b32_e32 v2, 0
919; GFX8-NEXT:    s_not_b64 exec, exec
920; GFX8-NEXT:    s_or_saveexec_b64 s[4:5], -1
921; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
922; GFX8-NEXT:    s_nop 1
923; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
924; GFX8-NEXT:    s_nop 1
925; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
926; GFX8-NEXT:    s_nop 1
927; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
928; GFX8-NEXT:    s_nop 1
929; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
930; GFX8-NEXT:    s_nop 1
931; GFX8-NEXT:    v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
932; GFX8-NEXT:    v_readlane_b32 s6, v2, 63
933; GFX8-NEXT:    s_nop 0
934; GFX8-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
935; GFX8-NEXT:    s_mov_b64 exec, s[4:5]
936; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
937; GFX8-NEXT:    ; implicit-def: $vgpr0
938; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
939; GFX8-NEXT:    s_cbranch_execz .LBB6_2
940; GFX8-NEXT:  ; %bb.1:
941; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
942; GFX8-NEXT:    v_mov_b32_e32 v0, s6
943; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
944; GFX8-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
945; GFX8-NEXT:  .LBB6_2:
946; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
947; GFX8-NEXT:    s_waitcnt vmcnt(0)
948; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
949; GFX8-NEXT:    v_mov_b32_e32 v0, v1
950; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX8-NEXT:    v_mov_b32_e32 v4, s3
952; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s0, v0
953; GFX8-NEXT:    v_mov_b32_e32 v3, s2
954; GFX8-NEXT:    flat_store_dword v[3:4], v0
955; GFX8-NEXT:    s_endpgm
956;
957; GFX9-LABEL: sub_i32_varying_vdata:
958; GFX9:       ; %bb.0: ; %entry
959; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
960; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
961; GFX9-NEXT:    v_mov_b32_e32 v1, 0
962; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
963; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
964; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
965; GFX9-NEXT:    v_mov_b32_e32 v2, v0
966; GFX9-NEXT:    s_not_b64 exec, exec
967; GFX9-NEXT:    v_mov_b32_e32 v2, 0
968; GFX9-NEXT:    s_not_b64 exec, exec
969; GFX9-NEXT:    s_or_saveexec_b64 s[4:5], -1
970; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
971; GFX9-NEXT:    s_nop 1
972; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
973; GFX9-NEXT:    s_nop 1
974; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
975; GFX9-NEXT:    s_nop 1
976; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
977; GFX9-NEXT:    s_nop 1
978; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
979; GFX9-NEXT:    s_nop 1
980; GFX9-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
981; GFX9-NEXT:    v_readlane_b32 s6, v2, 63
982; GFX9-NEXT:    s_nop 0
983; GFX9-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
984; GFX9-NEXT:    s_mov_b64 exec, s[4:5]
985; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
986; GFX9-NEXT:    ; implicit-def: $vgpr0
987; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
988; GFX9-NEXT:    s_cbranch_execz .LBB6_2
989; GFX9-NEXT:  ; %bb.1:
990; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
991; GFX9-NEXT:    v_mov_b32_e32 v0, s6
992; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
993; GFX9-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
994; GFX9-NEXT:  .LBB6_2:
995; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
996; GFX9-NEXT:    s_waitcnt vmcnt(0)
997; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
998; GFX9-NEXT:    v_mov_b32_e32 v0, v1
999; GFX9-NEXT:    v_sub_u32_e32 v0, s0, v0
1000; GFX9-NEXT:    v_mov_b32_e32 v3, 0
1001; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1002; GFX9-NEXT:    global_store_dword v3, v0, s[2:3]
1003; GFX9-NEXT:    s_endpgm
1004;
1005; GFX10W64-LABEL: sub_i32_varying_vdata:
1006; GFX10W64:       ; %bb.0: ; %entry
1007; GFX10W64-NEXT:    v_mov_b32_e32 v1, v0
1008; GFX10W64-NEXT:    s_not_b64 exec, exec
1009; GFX10W64-NEXT:    v_mov_b32_e32 v1, 0
1010; GFX10W64-NEXT:    s_not_b64 exec, exec
1011; GFX10W64-NEXT:    s_or_saveexec_b64 s[2:3], -1
1012; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1013; GFX10W64-NEXT:    v_mov_b32_e32 v3, 0
1014; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1015; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1016; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1017; GFX10W64-NEXT:    v_mov_b32_e32 v2, v1
1018; GFX10W64-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1019; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1020; GFX10W64-NEXT:    v_readlane_b32 s4, v1, 31
1021; GFX10W64-NEXT:    v_mov_b32_e32 v2, s4
1022; GFX10W64-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf
1023; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 15
1024; GFX10W64-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1025; GFX10W64-NEXT:    s_mov_b64 exec, s[2:3]
1026; GFX10W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1027; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1028; GFX10W64-NEXT:    v_readlane_b32 s7, v1, 31
1029; GFX10W64-NEXT:    v_writelane_b32 v3, s6, 16
1030; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1031; GFX10W64-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1032; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1033; GFX10W64-NEXT:    v_readlane_b32 s6, v1, 63
1034; GFX10W64-NEXT:    v_readlane_b32 s8, v1, 47
1035; GFX10W64-NEXT:    v_writelane_b32 v3, s7, 32
1036; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1037; GFX10W64-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1038; GFX10W64-NEXT:    s_or_saveexec_b64 s[4:5], -1
1039; GFX10W64-NEXT:    v_writelane_b32 v3, s8, 48
1040; GFX10W64-NEXT:    s_mov_b64 exec, s[4:5]
1041; GFX10W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1042; GFX10W64-NEXT:    ; implicit-def: $vgpr0
1043; GFX10W64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
1044; GFX10W64-NEXT:    s_cbranch_execz .LBB6_2
1045; GFX10W64-NEXT:  ; %bb.1:
1046; GFX10W64-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1047; GFX10W64-NEXT:    v_mov_b32_e32 v0, s6
1048; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1049; GFX10W64-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1050; GFX10W64-NEXT:  .LBB6_2:
1051; GFX10W64-NEXT:    s_waitcnt_depctr 0xffe3
1052; GFX10W64-NEXT:    s_or_b64 exec, exec, s[4:5]
1053; GFX10W64-NEXT:    s_waitcnt vmcnt(0)
1054; GFX10W64-NEXT:    v_readfirstlane_b32 s0, v0
1055; GFX10W64-NEXT:    v_mov_b32_e32 v0, v3
1056; GFX10W64-NEXT:    v_mov_b32_e32 v4, 0
1057; GFX10W64-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1058; GFX10W64-NEXT:    s_waitcnt lgkmcnt(0)
1059; GFX10W64-NEXT:    global_store_dword v4, v0, s[2:3]
1060; GFX10W64-NEXT:    s_endpgm
1061;
1062; GFX10W32-LABEL: sub_i32_varying_vdata:
1063; GFX10W32:       ; %bb.0: ; %entry
1064; GFX10W32-NEXT:    v_mov_b32_e32 v1, v0
1065; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1066; GFX10W32-NEXT:    v_mov_b32_e32 v1, 0
1067; GFX10W32-NEXT:    s_not_b32 exec_lo, exec_lo
1068; GFX10W32-NEXT:    s_or_saveexec_b32 s2, -1
1069; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
1070; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
1071; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
1072; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
1073; GFX10W32-NEXT:    v_mov_b32_e32 v2, v1
1074; GFX10W32-NEXT:    v_permlanex16_b32 v2, v2, -1, -1
1075; GFX10W32-NEXT:    s_mov_b32 exec_lo, s2
1076; GFX10W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
1077; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1078; GFX10W32-NEXT:    v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf
1079; GFX10W32-NEXT:    v_mov_b32_e32 v3, 0
1080; GFX10W32-NEXT:    v_readlane_b32 s6, v1, 31
1081; GFX10W32-NEXT:    v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf
1082; GFX10W32-NEXT:    v_readlane_b32 s5, v1, 15
1083; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1084; GFX10W32-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1085; GFX10W32-NEXT:    s_or_saveexec_b32 s4, -1
1086; GFX10W32-NEXT:    v_writelane_b32 v3, s5, 16
1087; GFX10W32-NEXT:    s_mov_b32 exec_lo, s4
1088; GFX10W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1089; GFX10W32-NEXT:    ; implicit-def: $vgpr0
1090; GFX10W32-NEXT:    s_and_saveexec_b32 s4, vcc_lo
1091; GFX10W32-NEXT:    s_cbranch_execz .LBB6_2
1092; GFX10W32-NEXT:  ; %bb.1:
1093; GFX10W32-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x34
1094; GFX10W32-NEXT:    v_mov_b32_e32 v0, s6
1095; GFX10W32-NEXT:    s_mov_b32 s5, s6
1096; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1097; GFX10W32-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 glc
1098; GFX10W32-NEXT:  .LBB6_2:
1099; GFX10W32-NEXT:    s_waitcnt_depctr 0xffe3
1100; GFX10W32-NEXT:    s_or_b32 exec_lo, exec_lo, s4
1101; GFX10W32-NEXT:    s_waitcnt vmcnt(0)
1102; GFX10W32-NEXT:    v_readfirstlane_b32 s0, v0
1103; GFX10W32-NEXT:    v_mov_b32_e32 v0, v3
1104; GFX10W32-NEXT:    v_mov_b32_e32 v4, 0
1105; GFX10W32-NEXT:    v_sub_nc_u32_e32 v0, s0, v0
1106; GFX10W32-NEXT:    s_waitcnt lgkmcnt(0)
1107; GFX10W32-NEXT:    global_store_dword v4, v0, s[2:3]
1108; GFX10W32-NEXT:    s_endpgm
; IR under test: %lane (workitem.id.x) is passed as the first (data) operand
; of the atomic sub; the remaining i32 operands are all 0.
1109entry:
1110  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1111  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 %lane, <4 x i32> %inout, i32 0, i32 0, i32 0)
1112  store i32 %old, i32 addrspace(1)* %out
1113  ret void
1114}
1115
; sub_i32_varying_offset: raw buffer atomic sub with a constant data operand
; of 1 and llvm.amdgcn.workitem.id.x passed as the third operand; the returned
; value is stored to %out.
; Every check prefix below expects one buffer_atomic_sub that takes v0 through
; `offen`, with no mbcnt / saveexec / readfirstlane sequence around it. Both
; GFX10 RUN lines share the common GFX10 prefix for this function.
1116define amdgpu_kernel void @sub_i32_varying_offset(i32 addrspace(1)* %out, <4 x i32> %inout) {
1117; GFX6-LABEL: sub_i32_varying_offset:
1118; GFX6:       ; %bb.0: ; %entry
1119; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
1120; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
1121; GFX6-NEXT:    v_mov_b32_e32 v1, 1
1122; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
1123; GFX6-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1124; GFX6-NEXT:    s_mov_b32 s3, 0xf000
1125; GFX6-NEXT:    s_mov_b32 s2, -1
1126; GFX6-NEXT:    s_waitcnt vmcnt(0)
1127; GFX6-NEXT:    buffer_store_dword v1, off, s[0:3], 0
1128; GFX6-NEXT:    s_endpgm
1129;
1130; GFX8-LABEL: sub_i32_varying_offset:
1131; GFX8:       ; %bb.0: ; %entry
1132; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1133; GFX8-NEXT:    v_mov_b32_e32 v2, 1
1134; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1135; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
1136; GFX8-NEXT:    buffer_atomic_sub v2, v0, s[4:7], 0 offen glc
1137; GFX8-NEXT:    v_mov_b32_e32 v0, s0
1138; GFX8-NEXT:    v_mov_b32_e32 v1, s1
1139; GFX8-NEXT:    s_waitcnt vmcnt(0)
1140; GFX8-NEXT:    flat_store_dword v[0:1], v2
1141; GFX8-NEXT:    s_endpgm
1142;
1143; GFX9-LABEL: sub_i32_varying_offset:
1144; GFX9:       ; %bb.0: ; %entry
1145; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1146; GFX9-NEXT:    v_mov_b32_e32 v1, 1
1147; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1148; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1149; GFX9-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1150; GFX9-NEXT:    v_mov_b32_e32 v0, 0
1151; GFX9-NEXT:    s_waitcnt vmcnt(0)
1152; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
1153; GFX9-NEXT:    s_endpgm
1154;
1155; GFX10-LABEL: sub_i32_varying_offset:
1156; GFX10:       ; %bb.0: ; %entry
1157; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
1158; GFX10-NEXT:    v_mov_b32_e32 v1, 1
1159; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
1160; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1161; GFX10-NEXT:    buffer_atomic_sub v1, v0, s[4:7], 0 offen glc
1162; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1163; GFX10-NEXT:    s_waitcnt vmcnt(0)
1164; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
1165; GFX10-NEXT:    s_endpgm
; IR under test: data operand is the constant 1; %lane (workitem.id.x) is the
; third operand of the atomic sub and the last two i32 operands are 0.
1166entry:
1167  %lane = call i32 @llvm.amdgcn.workitem.id.x()
1168  %old = call i32 @llvm.amdgcn.raw.buffer.atomic.sub(i32 1, <4 x i32> %inout, i32 %lane, i32 0, i32 0)
1169  store i32 %old, i32 addrspace(1)* %out
1170  ret void
1171}
1172